From 2d22c9abbd20fdc3d3a469075bd151ca1bd02ee4 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Wed, 26 Jul 2023 21:11:12 +0200 Subject: [PATCH 1/3] Vendor import of llvm-project branch release/17.x llvmorg-17-init-19311-gbc849e525f80. --- clang/lib/Basic/Targets/LoongArch.cpp | 25 +- clang/lib/Basic/Targets/LoongArch.h | 3 + .../lib/Driver/ToolChains/Arch/LoongArch.cpp | 23 +- clang/lib/Driver/ToolChains/Clang.cpp | 22 +- libcxx/include/__mdspan/extents.h | 63 ++-- libcxx/include/__mdspan/layout_left.h | 30 +- libcxx/include/__mdspan/layout_right.h | 30 +- libcxx/include/__mdspan/mdspan.h | 308 ++++++++++++++++++ libcxx/include/mdspan | 130 ++++++++ libcxx/include/module.modulemap.in | 1 + libcxx/modules/std/mdspan.cppm | 2 +- .../llvm/TargetParser/LoongArchTargetParser.h | 9 +- .../Transforms/IPO/FunctionSpecialization.h | 15 +- .../CodeGen/TargetLoweringObjectFileImpl.cpp | 4 +- llvm/lib/Target/LoongArch/LoongArch.td | 5 + .../TargetParser/LoongArchTargetParser.cpp | 25 ++ .../Transforms/IPO/FunctionSpecialization.cpp | 82 +---- openmp/runtime/src/ompt-event-specific.h | 13 +- 18 files changed, 644 insertions(+), 146 deletions(-) create mode 100644 libcxx/include/__mdspan/mdspan.h diff --git a/clang/lib/Basic/Targets/LoongArch.cpp b/clang/lib/Basic/Targets/LoongArch.cpp index 6958479cd7c4..f08e5e732b03 100644 --- a/clang/lib/Basic/Targets/LoongArch.cpp +++ b/clang/lib/Basic/Targets/LoongArch.cpp @@ -15,7 +15,7 @@ #include "clang/Basic/MacroBuilder.h" #include "clang/Basic/TargetBuiltins.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/TargetParser/TargetParser.h" +#include "llvm/TargetParser/LoongArchTargetParser.h" using namespace clang; using namespace clang::targets; @@ -198,7 +198,19 @@ void LoongArchTargetInfo::getTargetDefines(const LangOptions &Opts, else Builder.defineMacro("__loongarch_frlen", "0"); - // TODO: define __loongarch_arch and __loongarch_tune. + // Define __loongarch_arch. + StringRef Arch = llvm::LoongArch::getArch(); + if (Arch.empty()) + Arch = llvm::LoongArch::getDefaultArch(GRLen == 64); + if (!Arch.empty()) + Builder.defineMacro("__loongarch_arch", Arch); + + // Define __loongarch_tune. 
+ StringRef TuneCPU = llvm::LoongArch::getTuneCPU(); + if (TuneCPU.empty()) + TuneCPU = Arch; + if (!TuneCPU.empty()) + Builder.defineMacro("__loongarch_tune", TuneCPU); StringRef ABI = getABI(); if (ABI == "lp64d" || ABI == "lp64f" || ABI == "lp64s") @@ -270,3 +282,12 @@ bool LoongArchTargetInfo::handleTargetFeatures( } return true; } + +bool LoongArchTargetInfo::isValidTuneCPUName(StringRef Name) const { + return llvm::LoongArch::isValidTuneCPUName(Name); +} + +void LoongArchTargetInfo::fillValidTuneCPUList( + SmallVectorImpl &Values) const { + llvm::LoongArch::fillValidTuneCPUList(Values); +} diff --git a/clang/lib/Basic/Targets/LoongArch.h b/clang/lib/Basic/Targets/LoongArch.h index 52c4ce425368..60d545566b30 100644 --- a/clang/lib/Basic/Targets/LoongArch.h +++ b/clang/lib/Basic/Targets/LoongArch.h @@ -80,6 +80,9 @@ class LLVM_LIBRARY_VISIBILITY LoongArchTargetInfo : public TargetInfo { const std::vector &FeaturesVec) const override; bool hasFeature(StringRef Feature) const override; + + bool isValidTuneCPUName(StringRef Name) const override; + void fillValidTuneCPUList(SmallVectorImpl &Values) const override; }; class LLVM_LIBRARY_VISIBILITY LoongArch32TargetInfo diff --git a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp index 856ad58f3bd9..6cbb06b9a91f 100644 --- a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp +++ b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp @@ -12,6 +12,7 @@ #include "clang/Driver/Driver.h" #include "clang/Driver/DriverDiagnostic.h" #include "clang/Driver/Options.h" +#include "llvm/TargetParser/Host.h" #include "llvm/TargetParser/LoongArchTargetParser.h" using namespace clang::driver; @@ -128,21 +129,29 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D, std::vector &Features) { StringRef ArchName; if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) { - if (!llvm::LoongArch::isValidArchName(A->getValue())) { + ArchName = A->getValue(); + + // Handle -march=native. + if (ArchName == "native") { + ArchName = llvm::sys::getHostCPUName(); + if (ArchName == "generic") + ArchName = llvm::LoongArch::getDefaultArch(Triple.isLoongArch64()); + } + + if (!llvm::LoongArch::isValidArchName(ArchName)) { D.Diag(clang::diag::err_drv_invalid_arch_name) << A->getAsString(Args); return; } - ArchName = A->getValue(); } - // TODO: handle -march=native and -mtune=xx. - // Select a default arch name. - if (ArchName.empty() && Triple.isLoongArch64()) - ArchName = "loongarch64"; + if (ArchName.empty()) + ArchName = llvm::LoongArch::getDefaultArch(Triple.isLoongArch64()); - if (!ArchName.empty()) + if (!ArchName.empty()) { llvm::LoongArch::getArchFeatures(ArchName, Features); + llvm::LoongArch::setArch(ArchName); + } // Select floating-point features determined by -mdouble-float, // -msingle-float, -msoft-float and -mfpu. 
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index adb550d9c5da..e3fa315ffcb1 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -56,6 +56,7 @@ #include "llvm/Support/YAMLParser.h" #include "llvm/TargetParser/ARMTargetParserCommon.h" #include "llvm/TargetParser/Host.h" +#include "llvm/TargetParser/LoongArchTargetParser.h" #include "llvm/TargetParser/RISCVTargetParser.h" #include @@ -1853,10 +1854,25 @@ void Clang::AddAArch64TargetArgs(const ArgList &Args, void Clang::AddLoongArchTargetArgs(const ArgList &Args, ArgStringList &CmdArgs) const { + const llvm::Triple &Triple = getToolChain().getTriple(); + CmdArgs.push_back("-target-abi"); - CmdArgs.push_back(loongarch::getLoongArchABI(getToolChain().getDriver(), Args, - getToolChain().getTriple()) - .data()); + CmdArgs.push_back( + loongarch::getLoongArchABI(getToolChain().getDriver(), Args, Triple) + .data()); + + // Handle -mtune. + if (const Arg *A = Args.getLastArg(options::OPT_mtune_EQ)) { + StringRef TuneCPU = A->getValue(); + if (TuneCPU == "native") { + TuneCPU = llvm::sys::getHostCPUName(); + if (TuneCPU == "generic") + TuneCPU = llvm::LoongArch::getDefaultArch(Triple.isLoongArch64()); + } + CmdArgs.push_back("-tune-cpu"); + CmdArgs.push_back(Args.MakeArgString(TuneCPU)); + llvm::LoongArch::setTuneCPU(TuneCPU); + } } void Clang::AddMIPSTargetArgs(const ArgList &Args, diff --git a/libcxx/include/__mdspan/extents.h b/libcxx/include/__mdspan/extents.h index 42355678d60c..a510220d4096 100644 --- a/libcxx/include/__mdspan/extents.h +++ b/libcxx/include/__mdspan/extents.h @@ -171,11 +171,14 @@ struct __maybe_static_array { _TStatic __static_val = _StaticValues::__get(__i); if (__static_val == _DynTag) { __dyn_vals_[_DynamicIdxMap::__get(__i)] = __values[__i]; - } - // Precondition check - else - _LIBCPP_ASSERT_UNCATEGORIZED(__values[__i] == static_cast<_TDynamic>(__static_val), - "extents construction: mismatch of provided arguments with static extents."); + } else + // Not catching this could lead to out of bounds errors later + // e.g. using my_mdspan_t = mdspan>; my_mdspan_t = m(new int[5], 5); + // Right-hand-side construction looks ok with allocation and size matching, + // but since (potentially elsewhere defined) my_mdspan_t has static size m now thinks its range is 10 not 5 + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( + __values[__i] == static_cast<_TDynamic>(__static_val), + "extents construction: mismatch of provided arguments with static extents."); } } @@ -187,11 +190,14 @@ struct __maybe_static_array { _TStatic __static_val = _StaticValues::__get(__i); if (__static_val == _DynTag) { __dyn_vals_[_DynamicIdxMap::__get(__i)] = static_cast<_TDynamic>(__vals[__i]); - } - // Precondition check - else - _LIBCPP_ASSERT_UNCATEGORIZED(static_cast<_TDynamic>(__vals[__i]) == static_cast<_TDynamic>(__static_val), - "extents construction: mismatch of provided arguments with static extents."); + } else + // Not catching this could lead to out of bounds errors later + // e.g. 
using my_mdspan_t = mdspan>; my_mdspan_t = m(new int[N], span(&N)); + // Right-hand-side construction looks ok with allocation and size matching, + // but since (potentially elsewhere defined) my_mdspan_t has static size m now thinks its range is 10 not N + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( + static_cast<_TDynamic>(__vals[__i]) == static_cast<_TDynamic>(__static_val), + "extents construction: mismatch of provided arguments with static extents."); } } @@ -310,28 +316,37 @@ class extents { (sizeof...(_OtherIndexTypes) == __rank_ || sizeof...(_OtherIndexTypes) == __rank_dynamic_)) _LIBCPP_HIDE_FROM_ABI constexpr explicit extents(_OtherIndexTypes... __dynvals) noexcept : __vals_(static_cast(__dynvals)...) { - _LIBCPP_ASSERT_UNCATEGORIZED(__mdspan_detail::__are_representable_as(__dynvals...), - "extents ctor: arguments must be representable as index_type and nonnegative"); + // Not catching this could lead to out of bounds errors later + // e.g. mdspan m(ptr, dextents(200u)); leads to an extent of -56 on m + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__mdspan_detail::__are_representable_as(__dynvals...), + "extents ctor: arguments must be representable as index_type and nonnegative"); } template - requires(is_convertible_v<_OtherIndexType, index_type> && is_nothrow_constructible_v && + requires(is_convertible_v && + is_nothrow_constructible_v && (_Size == __rank_ || _Size == __rank_dynamic_)) explicit(_Size != __rank_dynamic_) _LIBCPP_HIDE_FROM_ABI constexpr extents(const array<_OtherIndexType, _Size>& __exts) noexcept : __vals_(span(__exts)) { - _LIBCPP_ASSERT_UNCATEGORIZED(__mdspan_detail::__are_representable_as(span(__exts)), - "extents ctor: arguments must be representable as index_type and nonnegative"); + // Not catching this could lead to out of bounds errors later + // e.g. mdspan m(ptr, dextents(array(200))); leads to an extent of -56 on m + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__mdspan_detail::__are_representable_as(span(__exts)), + "extents ctor: arguments must be representable as index_type and nonnegative"); } template - requires(is_convertible_v<_OtherIndexType, index_type> && is_nothrow_constructible_v && + requires(is_convertible_v && + is_nothrow_constructible_v && (_Size == __rank_ || _Size == __rank_dynamic_)) explicit(_Size != __rank_dynamic_) _LIBCPP_HIDE_FROM_ABI constexpr extents(const span<_OtherIndexType, _Size>& __exts) noexcept : __vals_(__exts) { - _LIBCPP_ASSERT_UNCATEGORIZED(__mdspan_detail::__are_representable_as(__exts), - "extents ctor: arguments must be representable as index_type and nonnegative"); + // Not catching this could lead to out of bounds errors later + // e.g. array a{200u}; mdspan> m(ptr, extents(span(a))); leads to an extent of -56 + // on m + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__mdspan_detail::__are_representable_as(__exts), + "extents ctor: arguments must be representable as index_type and nonnegative"); } private: @@ -380,10 +395,16 @@ class extents { for (size_t __r = 0; __r < rank(); __r++) { if constexpr (static_cast>(numeric_limits::max()) < static_cast>(numeric_limits<_OtherIndexType>::max())) { - _LIBCPP_ASSERT_UNCATEGORIZED(__mdspan_detail::__is_representable_as(__other.extent(__r)), - "extents ctor: arguments must be representable as index_type and nonnegative"); + // Not catching this could lead to out of bounds errors later + // e.g. 
dextents> e(dextents(200)) leads to an extent of -56 on e + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( + __mdspan_detail::__is_representable_as(__other.extent(__r)), + "extents ctor: arguments must be representable as index_type and nonnegative"); } - _LIBCPP_ASSERT_UNCATEGORIZED( + // Not catching this could lead to out of bounds errors later + // e.g. mdspan> m = mdspan>(new int[5], 5); + // Right-hand-side construction was ok, but m now thinks its range is 10 not 5 + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( (_Values::__static_value(__r) == dynamic_extent) || (static_cast(__other.extent(__r)) == static_cast(_Values::__static_value(__r))), "extents construction: mismatch of provided arguments with static extents."); diff --git a/libcxx/include/__mdspan/layout_left.h b/libcxx/include/__mdspan/layout_left.h index e81e0d10b595..f890c5ae0256 100644 --- a/libcxx/include/__mdspan/layout_left.h +++ b/libcxx/include/__mdspan/layout_left.h @@ -75,8 +75,11 @@ class layout_left::mapping { _LIBCPP_HIDE_FROM_ABI constexpr mapping() noexcept = default; _LIBCPP_HIDE_FROM_ABI constexpr mapping(const mapping&) noexcept = default; _LIBCPP_HIDE_FROM_ABI constexpr mapping(const extents_type& __ext) noexcept : __extents_(__ext) { - _LIBCPP_ASSERT(__required_span_size_is_representable(__ext), - "layout_left::mapping extents ctor: product of extents must be representable as index_type."); + // not catching this could lead to out-of-bounds access later when used inside mdspan + // mapping> map(dextents(40,40)); map(10, 3) == -126 + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( + __required_span_size_is_representable(__ext), + "layout_left::mapping extents ctor: product of extents must be representable as index_type."); } template @@ -84,7 +87,9 @@ class layout_left::mapping { _LIBCPP_HIDE_FROM_ABI constexpr explicit(!is_convertible_v<_OtherExtents, extents_type>) mapping(const mapping<_OtherExtents>& __other) noexcept : __extents_(__other.extents()) { - _LIBCPP_ASSERT( + // not catching this could lead to out-of-bounds access later when used inside mdspan + // mapping> map(mapping>(dextents(40,40))); map(10, 3) == -126 + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( __mdspan_detail::__is_representable_as(__other.required_span_size()), "layout_left::mapping converting ctor: other.required_span_size() must be representable as index_type."); } @@ -94,7 +99,13 @@ class layout_left::mapping { _LIBCPP_HIDE_FROM_ABI constexpr explicit(!is_convertible_v<_OtherExtents, extents_type>) mapping(const layout_right::mapping<_OtherExtents>& __other) noexcept : __extents_(__other.extents()) { - _LIBCPP_ASSERT( + // not catching this could lead to out-of-bounds access later when used inside mdspan + // Note: since this is constraint to rank 1, extents itself would catch the invalid conversion first + // and thus this assertion should never be triggered, but keeping it here for consistency + // layout_left::mapping> map( + // layout_right::mapping>(dextents(200))); map.extents().extent(0) == + // -56 + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( __mdspan_detail::__is_representable_as(__other.required_span_size()), "layout_left::mapping converting ctor: other.required_span_size() must be representable as index_type."); } @@ -122,6 +133,10 @@ class layout_left::mapping { requires((sizeof...(_Indices) == extents_type::rank()) && (is_convertible_v<_Indices, index_type> && ...) && (is_nothrow_constructible_v && ...)) _LIBCPP_HIDE_FROM_ABI constexpr index_type operator()(_Indices... 
__idx) const noexcept { + // Mappings are generally meant to be used for accessing allocations and are meant to guarantee to never + // return a value exceeding required_span_size(), which is used to know how large an allocation one needs + // Thus, this is a canonical point in multi-dimensional data structures to make invalid element access checks + // However, mdspan does check this on its own, so for now we avoid double checking in hardened mode _LIBCPP_ASSERT(__mdspan_detail::__is_multidimensional_index_in(__extents_, __idx...), "layout_left::mapping: out of bounds indexing"); array __idx_a{static_cast(__idx)...}; @@ -144,7 +159,10 @@ class layout_left::mapping { _LIBCPP_HIDE_FROM_ABI constexpr index_type stride(rank_type __r) const noexcept requires(extents_type::rank() > 0) { - _LIBCPP_ASSERT(__r < extents_type::rank(), "layout_left::mapping::stride(): invalid rank index"); + // While it would be caught by extents itself too, using a too large __r + // is functionally an out of bounds access on the stored information needed to compute strides + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( + __r < extents_type::rank(), "layout_left::mapping::stride(): invalid rank index"); index_type __s = 1; for (rank_type __i = extents_type::rank() - 1; __i > __r; __i--) __s *= __extents_.extent(__i); @@ -159,7 +177,7 @@ class layout_left::mapping { } private: - extents_type __extents_{}; // exposition only + _LIBCPP_NO_UNIQUE_ADDRESS extents_type __extents_{}; }; #endif // _LIBCPP_STD_VER >= 23 diff --git a/libcxx/include/__mdspan/layout_right.h b/libcxx/include/__mdspan/layout_right.h index a8a91b86c714..3d814554a1be 100644 --- a/libcxx/include/__mdspan/layout_right.h +++ b/libcxx/include/__mdspan/layout_right.h @@ -74,8 +74,11 @@ class layout_right::mapping { _LIBCPP_HIDE_FROM_ABI constexpr mapping() noexcept = default; _LIBCPP_HIDE_FROM_ABI constexpr mapping(const mapping&) noexcept = default; _LIBCPP_HIDE_FROM_ABI constexpr mapping(const extents_type& __ext) noexcept : __extents_(__ext) { - _LIBCPP_ASSERT(__required_span_size_is_representable(__ext), - "layout_right::mapping extents ctor: product of extents must be representable as index_type."); + // not catching this could lead to out-of-bounds access later when used inside mdspan + // mapping> map(dextents(40,40)); map(3, 10) == -126 + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( + __required_span_size_is_representable(__ext), + "layout_right::mapping extents ctor: product of extents must be representable as index_type."); } template @@ -83,7 +86,9 @@ class layout_right::mapping { _LIBCPP_HIDE_FROM_ABI constexpr explicit(!is_convertible_v<_OtherExtents, extents_type>) mapping(const mapping<_OtherExtents>& __other) noexcept : __extents_(__other.extents()) { - _LIBCPP_ASSERT( + // not catching this could lead to out-of-bounds access later when used inside mdspan + // mapping> map(mapping>(dextents(40,40))); map(3, 10) == -126 + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( __mdspan_detail::__is_representable_as(__other.required_span_size()), "layout_right::mapping converting ctor: other.required_span_size() must be representable as index_type."); } @@ -93,7 +98,13 @@ class layout_right::mapping { _LIBCPP_HIDE_FROM_ABI constexpr explicit(!is_convertible_v<_OtherExtents, extents_type>) mapping(const layout_left::mapping<_OtherExtents>& __other) noexcept : __extents_(__other.extents()) { - _LIBCPP_ASSERT( + // not catching this could lead to out-of-bounds access later when used inside mdspan + // Note: since this is constraint to rank 1, extents itself would catch the 
invalid conversion first + // and thus this assertion should never be triggered, but keeping it here for consistency + // layout_right::mapping> map( + // layout_left::mapping>(dextents(200))); map.extents().extent(0) == + // -56 + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( __mdspan_detail::__is_representable_as(__other.required_span_size()), "layout_right::mapping converting ctor: other.required_span_size() must be representable as index_type."); } @@ -121,6 +132,10 @@ class layout_right::mapping { requires((sizeof...(_Indices) == extents_type::rank()) && (is_convertible_v<_Indices, index_type> && ...) && (is_nothrow_constructible_v && ...)) _LIBCPP_HIDE_FROM_ABI constexpr index_type operator()(_Indices... __idx) const noexcept { + // Mappings are generally meant to be used for accessing allocations and are meant to guarantee to never + // return a value exceeding required_span_size(), which is used to know how large an allocation one needs + // Thus, this is a canonical point in multi-dimensional data structures to make invalid element access checks + // However, mdspan does check this on its own, so for now we avoid double checking in hardened mode _LIBCPP_ASSERT(__mdspan_detail::__is_multidimensional_index_in(__extents_, __idx...), "layout_right::mapping: out of bounds indexing"); return [&](index_sequence<_Pos...>) { @@ -141,7 +156,10 @@ class layout_right::mapping { _LIBCPP_HIDE_FROM_ABI constexpr index_type stride(rank_type __r) const noexcept requires(extents_type::rank() > 0) { - _LIBCPP_ASSERT(__r < extents_type::rank(), "layout_right::mapping::stride(): invalid rank index"); + // While it would be caught by extents itself too, using a too large __r + // is functionally an out of bounds access on the stored information needed to compute strides + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( + __r < extents_type::rank(), "layout_right::mapping::stride(): invalid rank index"); index_type __s = 1; for (rank_type __i = extents_type::rank() - 1; __i > __r; __i--) __s *= __extents_.extent(__i); @@ -156,7 +174,7 @@ class layout_right::mapping { } private: - extents_type __extents_{}; // exposition only + _LIBCPP_NO_UNIQUE_ADDRESS extents_type __extents_{}; }; #endif // _LIBCPP_STD_VER >= 23 diff --git a/libcxx/include/__mdspan/mdspan.h b/libcxx/include/__mdspan/mdspan.h new file mode 100644 index 000000000000..58f3b9cf1b18 --- /dev/null +++ b/libcxx/include/__mdspan/mdspan.h @@ -0,0 +1,308 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +//===---------------------------------------------------------------------===// + +#ifndef _LIBCPP___MDSPAN_MDSPAN_H +#define _LIBCPP___MDSPAN_MDSPAN_H + +#include <__assert> +#include <__config> +#include <__fwd/mdspan.h> +#include <__mdspan/default_accessor.h> +#include <__mdspan/extents.h> +#include <__type_traits/extent.h> +#include <__type_traits/is_abstract.h> +#include <__type_traits/is_array.h> +#include <__type_traits/is_constructible.h> +#include <__type_traits/is_convertible.h> +#include <__type_traits/is_default_constructible.h> +#include <__type_traits/is_nothrow_constructible.h> +#include <__type_traits/is_pointer.h> +#include <__type_traits/is_same.h> +#include <__type_traits/rank.h> +#include <__type_traits/remove_all_extents.h> +#include <__type_traits/remove_cv.h> +#include <__type_traits/remove_pointer.h> +#include <__type_traits/remove_reference.h> +#include <__utility/integer_sequence.h> +#include +#include +#include +#include +#include + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + +_LIBCPP_BEGIN_NAMESPACE_STD + +#if _LIBCPP_STD_VER >= 23 + +// Helper for lightweight test checking that one did pass a layout policy as LayoutPolicy template argument +namespace __mdspan_detail { +template +concept __has_invalid_mapping = !requires { typename _Layout::template mapping<_Extents>; }; +} // namespace __mdspan_detail + +template > +class mdspan { +private: + static_assert(__mdspan_detail::__is_extents_v<_Extents>, + "mdspan: Extents template parameter must be a specialization of extents."); + static_assert(!is_array_v<_ElementType>, "mdspan: ElementType template parameter may not be an array type"); + static_assert(!is_abstract_v<_ElementType>, "mdspan: ElementType template parameter may not be an abstract class"); + static_assert(is_same_v<_ElementType, typename _AccessorPolicy::element_type>, + "mdspan: ElementType template parameter must match AccessorPolicy::element_type"); + static_assert(!__mdspan_detail::__has_invalid_mapping<_LayoutPolicy, _Extents>, + "mdspan: LayoutPolicy template parameter is invalid. 
A common mistake is to pass a layout mapping " + "instead of a layout policy"); + +public: + using extents_type = _Extents; + using layout_type = _LayoutPolicy; + using accessor_type = _AccessorPolicy; + using mapping_type = typename layout_type::template mapping; + using element_type = _ElementType; + using value_type = remove_cv_t; + using index_type = typename extents_type::index_type; + using size_type = typename extents_type::size_type; + using rank_type = typename extents_type::rank_type; + using data_handle_type = typename accessor_type::data_handle_type; + using reference = typename accessor_type::reference; + + _LIBCPP_HIDE_FROM_ABI static constexpr rank_type rank() noexcept { return extents_type::rank(); } + _LIBCPP_HIDE_FROM_ABI static constexpr rank_type rank_dynamic() noexcept { return extents_type::rank_dynamic(); } + _LIBCPP_HIDE_FROM_ABI static constexpr size_t static_extent(rank_type __r) noexcept { + return extents_type::static_extent(__r); + } + _LIBCPP_HIDE_FROM_ABI constexpr index_type extent(rank_type __r) const noexcept { + return __map_.extents().extent(__r); + }; + +public: + //-------------------------------------------------------------------------------- + // [mdspan.mdspan.cons], mdspan constructors, assignment, and destructor + + _LIBCPP_HIDE_FROM_ABI constexpr mdspan() + requires((extents_type::rank_dynamic() > 0) && is_default_constructible_v && + is_default_constructible_v && is_default_constructible_v) + = default; + _LIBCPP_HIDE_FROM_ABI constexpr mdspan(const mdspan&) = default; + _LIBCPP_HIDE_FROM_ABI constexpr mdspan(mdspan&&) = default; + + template + requires((is_convertible_v<_OtherIndexTypes, index_type> && ...) && + (is_nothrow_constructible_v && ...) && + ((sizeof...(_OtherIndexTypes) == rank()) || (sizeof...(_OtherIndexTypes) == rank_dynamic())) && + is_constructible_v && is_default_constructible_v) + _LIBCPP_HIDE_FROM_ABI explicit constexpr mdspan(data_handle_type __p, _OtherIndexTypes... 
__exts) + : __ptr_(std::move(__p)), __map_(extents_type(static_cast(std::move(__exts))...)), __acc_{} {} + + template + requires(is_convertible_v && + is_nothrow_constructible_v && + ((_Size == rank()) || (_Size == rank_dynamic())) && is_constructible_v && + is_default_constructible_v) + explicit(_Size != rank_dynamic()) + _LIBCPP_HIDE_FROM_ABI constexpr mdspan(data_handle_type __p, const array<_OtherIndexType, _Size>& __exts) + : __ptr_(std::move(__p)), __map_(extents_type(__exts)), __acc_{} {} + + template + requires(is_convertible_v && + is_nothrow_constructible_v && + ((_Size == rank()) || (_Size == rank_dynamic())) && is_constructible_v && + is_default_constructible_v) + explicit(_Size != rank_dynamic()) + _LIBCPP_HIDE_FROM_ABI constexpr mdspan(data_handle_type __p, span<_OtherIndexType, _Size> __exts) + : __ptr_(std::move(__p)), __map_(extents_type(__exts)), __acc_{} {} + + _LIBCPP_HIDE_FROM_ABI constexpr mdspan(data_handle_type __p, const extents_type& __exts) + requires(is_default_constructible_v && is_constructible_v) + : __ptr_(std::move(__p)), __map_(__exts), __acc_{} {} + + _LIBCPP_HIDE_FROM_ABI constexpr mdspan(data_handle_type __p, const mapping_type& __m) + requires(is_default_constructible_v) + : __ptr_(std::move(__p)), __map_(__m), __acc_{} {} + + _LIBCPP_HIDE_FROM_ABI constexpr mdspan(data_handle_type __p, const mapping_type& __m, const accessor_type& __a) + : __ptr_(std::move(__p)), __map_(__m), __acc_(__a) {} + + template + requires(is_constructible_v&> && + is_constructible_v) + explicit(!is_convertible_v&, mapping_type> || + !is_convertible_v) + _LIBCPP_HIDE_FROM_ABI constexpr mdspan( + const mdspan<_OtherElementType, _OtherExtents, _OtherLayoutPolicy, _OtherAccessor>& __other) + : __ptr_(__other.__ptr_), __map_(__other.__map_), __acc_(__other.__acc_) { + static_assert(is_constructible_v, + "mdspan: incompatible data_handle_type for mdspan construction"); + static_assert( + is_constructible_v, "mdspan: incompatible extents for mdspan construction"); + + // The following precondition is part of the standard, but is unlikely to be triggered. + // The extents constructor checks this and the mapping must be storing the extents, since + // its extents() function returns a const reference to extents_type. + // The only way this can be triggered is if the mapping conversion constructor would for example + // always construct its extents() only from the dynamic extents, instead of from the other extents. + if constexpr (rank() > 0) { + for (size_t __r = 0; __r < rank(); __r++) { + // Not catching this could lead to out of bounds errors later + // e.g. mdspan, non_checking_layout> m = + // mdspan, non_checking_layout>(ptr, 200); leads to an extent of -56 on m + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( + (static_extent(__r) == dynamic_extent) || + (static_cast(__other.extent(__r)) == static_cast(static_extent(__r))), + "mdspan: conversion mismatch of source dynamic extents with static extents"); + } + } + } + + _LIBCPP_HIDE_FROM_ABI constexpr mdspan& operator=(const mdspan&) = default; + _LIBCPP_HIDE_FROM_ABI constexpr mdspan& operator=(mdspan&&) = default; + + //-------------------------------------------------------------------------------- + // [mdspan.mdspan.members], members + + template + requires((is_convertible_v<_OtherIndexTypes, index_type> && ...) && + (is_nothrow_constructible_v && ...) && + (sizeof...(_OtherIndexTypes) == rank())) + _LIBCPP_HIDE_FROM_ABI constexpr reference operator[](_OtherIndexTypes... 
__indices) const { + // Note the standard layouts would also check this, but user provided ones may not, so we + // check the precondition here + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__mdspan_detail::__is_multidimensional_index_in(extents(), __indices...), + "mdspan: operator[] out of bounds access"); + return __acc_.access(__ptr_, __map_(static_cast(std::move(__indices))...)); + } + + template + requires(is_convertible_v && + is_nothrow_constructible_v) + _LIBCPP_HIDE_FROM_ABI constexpr reference operator[](const array< _OtherIndexType, rank()>& __indices) const { + return __acc_.access(__ptr_, [&](index_sequence<_Idxs...>) { + return __map_(__indices[_Idxs]...); + }(make_index_sequence())); + } + + template + requires(is_convertible_v && + is_nothrow_constructible_v) + _LIBCPP_HIDE_FROM_ABI constexpr reference operator[](span<_OtherIndexType, rank()> __indices) const { + return __acc_.access(__ptr_, [&](index_sequence<_Idxs...>) { + return __map_(__indices[_Idxs]...); + }(make_index_sequence())); + } + + _LIBCPP_HIDE_FROM_ABI constexpr size_type size() const noexcept { + // Could leave this as only checked in debug mode: semantically size() is never + // guaranteed to be related to any accessible range + _LIBCPP_ASSERT_UNCATEGORIZED( + false == ([&](index_sequence<_Idxs...>) { + size_type __prod = 1; + return (__builtin_mul_overflow(__prod, extent(_Idxs), &__prod) || ... || false); + }(make_index_sequence())), + "mdspan: size() is not representable as size_type"); + return [&](index_sequence<_Idxs...>) { + return ((static_cast(__map_.extents().extent(_Idxs))) * ... * size_type(1)); + }(make_index_sequence()); + } + + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool empty() const noexcept { + return [&](index_sequence<_Idxs...>) { + return (rank() > 0) && ((__map_.extents().extent(_Idxs) == index_type(0)) || ... || false); + }(make_index_sequence()); + } + + _LIBCPP_HIDE_FROM_ABI friend constexpr void swap(mdspan& __x, mdspan& __y) noexcept { + swap(__x.__ptr_, __y.__ptr_); + swap(__x.__map_, __y.__map_); + swap(__x.__acc_, __y.__acc_); + } + + _LIBCPP_HIDE_FROM_ABI constexpr const extents_type& extents() const noexcept { return __map_.extents(); }; + _LIBCPP_HIDE_FROM_ABI constexpr const data_handle_type& data_handle() const noexcept { return __ptr_; }; + _LIBCPP_HIDE_FROM_ABI constexpr const mapping_type& mapping() const noexcept { return __map_; }; + _LIBCPP_HIDE_FROM_ABI constexpr const accessor_type& accessor() const noexcept { return __acc_; }; + + _LIBCPP_HIDE_FROM_ABI static constexpr bool is_always_unique() { return mapping_type::is_always_unique(); }; + _LIBCPP_HIDE_FROM_ABI static constexpr bool is_always_exhaustive() { return mapping_type::is_always_exhaustive(); }; + _LIBCPP_HIDE_FROM_ABI static constexpr bool is_always_strided() { return mapping_type::is_always_strided(); }; + + _LIBCPP_HIDE_FROM_ABI constexpr bool is_unique() const { return __map_.is_unique(); }; + _LIBCPP_HIDE_FROM_ABI constexpr bool is_exhaustive() const { return __map_.is_exhaustive(); }; + _LIBCPP_HIDE_FROM_ABI constexpr bool is_strided() const { return __map_.is_strided(); }; + _LIBCPP_HIDE_FROM_ABI constexpr index_type stride(rank_type __r) const { return __map_.stride(__r); }; + +private: + _LIBCPP_NO_UNIQUE_ADDRESS data_handle_type __ptr_{}; + _LIBCPP_NO_UNIQUE_ADDRESS mapping_type __map_{}; + _LIBCPP_NO_UNIQUE_ADDRESS accessor_type __acc_{}; + + template + friend class mdspan; +}; + +template + requires((is_convertible_v<_OtherIndexTypes, size_t> && ...) 
&& (sizeof...(_OtherIndexTypes) > 0)) +explicit mdspan(_ElementType*, _OtherIndexTypes...) + -> mdspan<_ElementType, dextents>; + +template + requires(is_pointer_v>) +mdspan(_Pointer&&) -> mdspan>, extents>; + +template + requires(is_array_v<_CArray> && (rank_v<_CArray> == 1)) +mdspan(_CArray&) -> mdspan, extents>>; + +template +mdspan(_ElementType*, const array<_OtherIndexType, _Size>&) -> mdspan<_ElementType, dextents>; + +template +mdspan(_ElementType*, span<_OtherIndexType, _Size>) -> mdspan<_ElementType, dextents>; + +// This one is necessary because all the constructors take `data_handle_type`s, not +// `_ElementType*`s, and `data_handle_type` is taken from `accessor_type::data_handle_type`, which +// seems to throw off automatic deduction guides. +template +mdspan(_ElementType*, const extents<_OtherIndexType, _ExtentsPack...>&) + -> mdspan<_ElementType, extents<_OtherIndexType, _ExtentsPack...>>; + +template +mdspan(_ElementType*, const _MappingType&) + -> mdspan<_ElementType, typename _MappingType::extents_type, typename _MappingType::layout_type>; + +template +mdspan(const typename _AccessorType::data_handle_type, const _MappingType&, const _AccessorType&) + -> mdspan; + +#endif // _LIBCPP_STD_VER >= 23 + +_LIBCPP_END_NAMESPACE_STD + +_LIBCPP_POP_MACROS + +#endif // _LIBCPP___MDSPAN_MDSPAN_H diff --git a/libcxx/include/mdspan b/libcxx/include/mdspan index 701def50b40a..9082eb8bdb55 100644 --- a/libcxx/include/mdspan +++ b/libcxx/include/mdspan @@ -208,6 +208,135 @@ namespace std { }; } +// mdspan synopsis + +namespace std { + template> + class mdspan { + public: + using extents_type = Extents; + using layout_type = LayoutPolicy; + using accessor_type = AccessorPolicy; + using mapping_type = typename layout_type::template mapping; + using element_type = ElementType; + using value_type = remove_cv_t; + using index_type = typename extents_type::index_type; + using size_type = typename extents_type::size_type; + using rank_type = typename extents_type::rank_type; + using data_handle_type = typename accessor_type::data_handle_type; + using reference = typename accessor_type::reference; + + static constexpr rank_type rank() noexcept { return extents_type::rank(); } + static constexpr rank_type rank_dynamic() noexcept { return extents_type::rank_dynamic(); } + static constexpr size_t static_extent(rank_type r) noexcept + { return extents_type::static_extent(r); } + constexpr index_type extent(rank_type r) const noexcept { return extents().extent(r); } + + // [mdspan.mdspan.cons], constructors + constexpr mdspan(); + constexpr mdspan(const mdspan& rhs) = default; + constexpr mdspan(mdspan&& rhs) = default; + + template + constexpr explicit mdspan(data_handle_type ptr, OtherIndexTypes... exts); + template + constexpr explicit(N != rank_dynamic()) + mdspan(data_handle_type p, span exts); + template + constexpr explicit(N != rank_dynamic()) + mdspan(data_handle_type p, const array& exts); + constexpr mdspan(data_handle_type p, const extents_type& ext); + constexpr mdspan(data_handle_type p, const mapping_type& m); + constexpr mdspan(data_handle_type p, const mapping_type& m, const accessor_type& a); + + template + constexpr explicit(see below) + mdspan(const mdspan& other); + + constexpr mdspan& operator=(const mdspan& rhs) = default; + constexpr mdspan& operator=(mdspan&& rhs) = default; + + // [mdspan.mdspan.members], members + template + constexpr reference operator[](OtherIndexTypes... 
indices) const; + template + constexpr reference operator[](span indices) const; + template + constexpr reference operator[](const array& indices) const; + + constexpr size_type size() const noexcept; + [[nodiscard]] constexpr bool empty() const noexcept; + + friend constexpr void swap(mdspan& x, mdspan& y) noexcept; + + constexpr const extents_type& extents() const noexcept { return map_.extents(); } + constexpr const data_handle_type& data_handle() const noexcept { return ptr_; } + constexpr const mapping_type& mapping() const noexcept { return map_; } + constexpr const accessor_type& accessor() const noexcept { return acc_; } + + static constexpr bool is_always_unique() + { return mapping_type::is_always_unique(); } + static constexpr bool is_always_exhaustive() + { return mapping_type::is_always_exhaustive(); } + static constexpr bool is_always_strided() + { return mapping_type::is_always_strided(); } + + constexpr bool is_unique() const + { return map_.is_unique(); } + constexpr bool is_exhaustive() const + { return map_.is_exhaustive(); } + constexpr bool is_strided() const + { return map_.is_strided(); } + constexpr index_type stride(rank_type r) const + { return map_.stride(r); } + + private: + accessor_type acc_; // exposition only + mapping_type map_; // exposition only + data_handle_type ptr_; // exposition only + }; + + template + requires(is_array_v && rank_v == 1) + mdspan(CArray&) + -> mdspan, extents>>; + + template + requires(is_pointer_v>) + mdspan(Pointer&&) + -> mdspan>, extents>; + + template + requires((is_convertible_v && ...) && sizeof...(Integrals) > 0) + explicit mdspan(ElementType*, Integrals...) + -> mdspan>; + + template + mdspan(ElementType*, span) + -> mdspan>; + + template + mdspan(ElementType*, const array&) + -> mdspan>; + + template + mdspan(ElementType*, const extents&) + -> mdspan>; + + template + mdspan(ElementType*, const MappingType&) + -> mdspan; + + template + mdspan(const typename AccessorType::data_handle_type&, const MappingType&, + const AccessorType&) + -> mdspan; +} */ #ifndef _LIBCPP_MDSPAN @@ -219,6 +348,7 @@ namespace std { #include <__mdspan/extents.h> #include <__mdspan/layout_left.h> #include <__mdspan/layout_right.h> +#include <__mdspan/mdspan.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in index 9ff8b67a6a20..0b418d2b7897 100644 --- a/libcxx/include/module.modulemap.in +++ b/libcxx/include/module.modulemap.in @@ -1516,6 +1516,7 @@ module std_private_mdspan_extents [system] { } module std_private_mdspan_layout_left [system] { header "__mdspan/layout_left.h" } module std_private_mdspan_layout_right [system] { header "__mdspan/layout_right.h" } +module std_private_mdspan_mdspan [system] { header "__mdspan/mdspan.h" } module std_private_mdspan_mdspan_fwd [system] { header "__fwd/mdspan.h" } module std_private_memory_addressof [system] { header "__memory/addressof.h" } diff --git a/libcxx/modules/std/mdspan.cppm b/libcxx/modules/std/mdspan.cppm index 5023dfb925ea..d92024d9a77a 100644 --- a/libcxx/modules/std/mdspan.cppm +++ b/libcxx/modules/std/mdspan.cppm @@ -27,5 +27,5 @@ export namespace std { using std::default_accessor; // [mdspan.mdspan], class template mdspan - // using std::mdspan; + using std::mdspan; } // namespace std diff --git a/llvm/include/llvm/TargetParser/LoongArchTargetParser.h b/llvm/include/llvm/TargetParser/LoongArchTargetParser.h index 2aa65ec070ec..82ab064211d7 100644 --- 
a/llvm/include/llvm/TargetParser/LoongArchTargetParser.h +++ b/llvm/include/llvm/TargetParser/LoongArchTargetParser.h @@ -66,9 +66,16 @@ struct ArchInfo { bool isValidArchName(StringRef Arch); bool getArchFeatures(StringRef Arch, std::vector &Features); +bool isValidTuneCPUName(StringRef TuneCPU); +void fillValidTuneCPUList(SmallVectorImpl &Values); +StringRef getDefaultArch(bool Is64Bit); +void setArch(StringRef Arch); +StringRef getArch(); +void setTuneCPU(StringRef TuneCPU); +StringRef getTuneCPU(); } // namespace LoongArch } // namespace llvm -#endif // LLVM_SUPPORT_LOONGARCHTARGETPARSER_H +#endif // LLVM_TARGETPARSER_LOONGARCHTARGETPARSER_H diff --git a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h index 4e78d9db024c..f780385f7f67 100644 --- a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h +++ b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h @@ -126,15 +126,6 @@ class InstCostVisitor : public InstVisitor { SCCPSolver &Solver; ConstMap KnownConstants; - // Basic blocks known to be unreachable after constant propagation. - DenseSet DeadBlocks; - // PHI nodes we have visited before. - DenseSet VisitedPHIs; - // PHI nodes we have visited once without successfully constant folding them. - // Once the InstCostVisitor has processed all the specialization arguments, - // it should be possible to determine whether those PHIs can be folded - // (some of their incoming values may have become constant or dead). - SmallVector PendingPHIs; ConstMap::iterator LastVisited; @@ -143,10 +134,7 @@ class InstCostVisitor : public InstVisitor { TargetTransformInfo &TTI, SCCPSolver &Solver) : DL(DL), BFI(BFI), TTI(TTI), Solver(Solver) {} - Cost getUserBonus(Instruction *User, Value *Use = nullptr, - Constant *C = nullptr); - - Cost getBonusFromPendingPHIs(); + Cost getUserBonus(Instruction *User, Value *Use, Constant *C); private: friend class InstVisitor; @@ -155,7 +143,6 @@ class InstCostVisitor : public InstVisitor { Cost estimateBranchInst(BranchInst &I); Constant *visitInstruction(Instruction &I) { return nullptr; } - Constant *visitPHINode(PHINode &I); Constant *visitFreezeInst(FreezeInst &I); Constant *visitCallBase(CallBase &I); Constant *visitLoadInst(LoadInst &I); diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index 3994552884c4..647f570ab807 100644 --- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -2628,12 +2628,12 @@ MCSymbol *TargetLoweringObjectFileXCOFF::getFunctionEntryPointSymbol( // function entry point csect instead. And for function delcarations, the // undefined symbols gets treated as csect with XTY_ER property. if (((TM.getFunctionSections() && !Func->hasSection()) || - Func->isDeclaration()) && + Func->isDeclarationForLinker()) && isa(Func)) { return getContext() .getXCOFFSection( NameStr, SectionKind::getText(), - XCOFF::CsectProperties(XCOFF::XMC_PR, Func->isDeclaration() + XCOFF::CsectProperties(XCOFF::XMC_PR, Func->isDeclarationForLinker() ? 
XCOFF::XTY_ER : XCOFF::XTY_SD)) ->getQualNameSymbol(); diff --git a/llvm/lib/Target/LoongArch/LoongArch.td b/llvm/lib/Target/LoongArch/LoongArch.td index 7241a5d63526..0675caa3b601 100644 --- a/llvm/lib/Target/LoongArch/LoongArch.td +++ b/llvm/lib/Target/LoongArch/LoongArch.td @@ -117,6 +117,11 @@ include "LoongArchInstrInfo.td" def : ProcessorModel<"generic-la32", NoSchedModel, [Feature32Bit]>; def : ProcessorModel<"generic-la64", NoSchedModel, [Feature64Bit, FeatureUAL]>; +// Generic 64-bit processor with double-precision floating-point support. +def : ProcessorModel<"loongarch64", NoSchedModel, [Feature64Bit, + FeatureUAL, + FeatureBasicD]>; + // Support generic for compatibility with other targets. The triple will be used // to change to the appropriate la32/la64 version. def : ProcessorModel<"generic", NoSchedModel, []>; diff --git a/llvm/lib/TargetParser/LoongArchTargetParser.cpp b/llvm/lib/TargetParser/LoongArchTargetParser.cpp index 18b04600dbc6..72781513ff12 100644 --- a/llvm/lib/TargetParser/LoongArchTargetParser.cpp +++ b/llvm/lib/TargetParser/LoongArchTargetParser.cpp @@ -16,6 +16,9 @@ using namespace llvm; using namespace llvm::LoongArch; +StringRef Arch; +StringRef TuneCPU; + const FeatureInfo AllFeatures[] = { #define LOONGARCH_FEATURE(NAME, KIND) {NAME, KIND}, #include "llvm/TargetParser/LoongArchTargetParser.def" @@ -46,3 +49,25 @@ bool LoongArch::getArchFeatures(StringRef Arch, } return false; } + +bool LoongArch::isValidTuneCPUName(StringRef TuneCPU) { + return isValidArchName(TuneCPU); +} + +void LoongArch::fillValidTuneCPUList(SmallVectorImpl &Values) { + for (const auto A : AllArchs) + Values.emplace_back(A.Name); +} + +StringRef LoongArch::getDefaultArch(bool Is64Bit) { + // TODO: use a real 32-bit arch name. + return Is64Bit ? "loongarch64" : ""; +} + +void LoongArch::setArch(StringRef Name) { Arch = Name; } + +StringRef LoongArch::getArch() { return Arch; } + +void LoongArch::setTuneCPU(StringRef Name) { TuneCPU = Name; } + +StringRef LoongArch::getTuneCPU() { return TuneCPU; } diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp index 3d6c501e4596..ac5dbc7cfb2a 100644 --- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -78,11 +78,6 @@ static cl::opt MaxClones( "The maximum number of clones allowed for a single function " "specialization")); -static cl::opt MaxIncomingPhiValues( - "funcspec-max-incoming-phi-values", cl::init(4), cl::Hidden, cl::desc( - "The maximum number of incoming values a PHI node can have to be " - "considered during the specialization bonus estimation")); - static cl::opt MinFunctionSize( "funcspec-min-function-size", cl::init(100), cl::Hidden, cl::desc( "Don't specialize functions that have less than this number of " @@ -109,7 +104,6 @@ static cl::opt SpecializeLiteralConstant( // the combination of size and latency savings in comparison to the non // specialized version of the function. static Cost estimateBasicBlocks(SmallVectorImpl &WorkList, - DenseSet &DeadBlocks, ConstMap &KnownConstants, SCCPSolver &Solver, BlockFrequencyInfo &BFI, TargetTransformInfo &TTI) { @@ -124,12 +118,6 @@ static Cost estimateBasicBlocks(SmallVectorImpl &WorkList, if (!Weight) continue; - // These blocks are considered dead as far as the InstCostVisitor is - // concerned. They haven't been proven dead yet by the Solver, but - // may become if we propagate the constant specialization arguments. 
- if (!DeadBlocks.insert(BB).second) - continue; - for (Instruction &I : *BB) { // Disregard SSA copies. if (auto *II = dyn_cast(&I)) @@ -164,19 +152,9 @@ static Constant *findConstantFor(Value *V, ConstMap &KnownConstants) { return nullptr; } -Cost InstCostVisitor::getBonusFromPendingPHIs() { - Cost Bonus = 0; - while (!PendingPHIs.empty()) { - Instruction *Phi = PendingPHIs.pop_back_val(); - Bonus += getUserBonus(Phi); - } - return Bonus; -} - Cost InstCostVisitor::getUserBonus(Instruction *User, Value *Use, Constant *C) { // Cache the iterator before visiting. - LastVisited = Use ? KnownConstants.insert({Use, C}).first - : KnownConstants.end(); + LastVisited = KnownConstants.insert({Use, C}).first; if (auto *I = dyn_cast(User)) return estimateSwitchInst(*I); @@ -203,15 +181,13 @@ Cost InstCostVisitor::getUserBonus(Instruction *User, Value *Use, Constant *C) { for (auto *U : User->users()) if (auto *UI = dyn_cast(U)) - if (UI != User && Solver.isBlockExecutable(UI->getParent())) + if (Solver.isBlockExecutable(UI->getParent())) Bonus += getUserBonus(UI, User, C); return Bonus; } Cost InstCostVisitor::estimateSwitchInst(SwitchInst &I) { - assert(LastVisited != KnownConstants.end() && "Invalid iterator!"); - if (I.getCondition() != LastVisited->first) return 0; @@ -232,13 +208,10 @@ Cost InstCostVisitor::estimateSwitchInst(SwitchInst &I) { WorkList.push_back(BB); } - return estimateBasicBlocks(WorkList, DeadBlocks, KnownConstants, Solver, BFI, - TTI); + return estimateBasicBlocks(WorkList, KnownConstants, Solver, BFI, TTI); } Cost InstCostVisitor::estimateBranchInst(BranchInst &I) { - assert(LastVisited != KnownConstants.end() && "Invalid iterator!"); - if (I.getCondition() != LastVisited->first) return 0; @@ -250,39 +223,10 @@ Cost InstCostVisitor::estimateBranchInst(BranchInst &I) { Succ->getUniquePredecessor() == I.getParent()) WorkList.push_back(Succ); - return estimateBasicBlocks(WorkList, DeadBlocks, KnownConstants, Solver, BFI, - TTI); -} - -Constant *InstCostVisitor::visitPHINode(PHINode &I) { - if (I.getNumIncomingValues() > MaxIncomingPhiValues) - return nullptr; - - bool Inserted = VisitedPHIs.insert(&I).second; - Constant *Const = nullptr; - - for (unsigned Idx = 0, E = I.getNumIncomingValues(); Idx != E; ++Idx) { - Value *V = I.getIncomingValue(Idx); - if (auto *Inst = dyn_cast(V)) - if (Inst == &I || DeadBlocks.contains(I.getIncomingBlock(Idx))) - continue; - Constant *C = findConstantFor(V, KnownConstants); - if (!C) { - if (Inserted) - PendingPHIs.push_back(&I); - return nullptr; - } - if (!Const) - Const = C; - else if (C != Const) - return nullptr; - } - return Const; + return estimateBasicBlocks(WorkList, KnownConstants, Solver, BFI, TTI); } Constant *InstCostVisitor::visitFreezeInst(FreezeInst &I) { - assert(LastVisited != KnownConstants.end() && "Invalid iterator!"); - if (isGuaranteedNotToBeUndefOrPoison(LastVisited->second)) return LastVisited->second; return nullptr; @@ -309,8 +253,6 @@ Constant *InstCostVisitor::visitCallBase(CallBase &I) { } Constant *InstCostVisitor::visitLoadInst(LoadInst &I) { - assert(LastVisited != KnownConstants.end() && "Invalid iterator!"); - if (isa(LastVisited->second)) return nullptr; return ConstantFoldLoadFromConstPtr(LastVisited->second, I.getType(), DL); @@ -333,8 +275,6 @@ Constant *InstCostVisitor::visitGetElementPtrInst(GetElementPtrInst &I) { } Constant *InstCostVisitor::visitSelectInst(SelectInst &I) { - assert(LastVisited != KnownConstants.end() && "Invalid iterator!"); - if (I.getCondition() != LastVisited->first) return 
nullptr; @@ -350,8 +290,6 @@ Constant *InstCostVisitor::visitCastInst(CastInst &I) { } Constant *InstCostVisitor::visitCmpInst(CmpInst &I) { - assert(LastVisited != KnownConstants.end() && "Invalid iterator!"); - bool Swap = I.getOperand(1) == LastVisited->first; Value *V = Swap ? I.getOperand(0) : I.getOperand(1); Constant *Other = findConstantFor(V, KnownConstants); @@ -365,14 +303,10 @@ Constant *InstCostVisitor::visitCmpInst(CmpInst &I) { } Constant *InstCostVisitor::visitUnaryOperator(UnaryOperator &I) { - assert(LastVisited != KnownConstants.end() && "Invalid iterator!"); - return ConstantFoldUnaryOpOperand(I.getOpcode(), LastVisited->second, DL); } Constant *InstCostVisitor::visitBinaryOperator(BinaryOperator &I) { - assert(LastVisited != KnownConstants.end() && "Invalid iterator!"); - bool Swap = I.getOperand(1) == LastVisited->first; Value *V = Swap ? I.getOperand(0) : I.getOperand(1); Constant *Other = findConstantFor(V, KnownConstants); @@ -779,17 +713,13 @@ bool FunctionSpecializer::findSpecializations(Function *F, Cost SpecCost, AllSpecs[Index].CallSites.push_back(&CS); } else { // Calculate the specialisation gain. - Cost Score = 0; + Cost Score = 0 - SpecCost; InstCostVisitor Visitor = getInstCostVisitorFor(F); for (ArgInfo &A : S.Args) Score += getSpecializationBonus(A.Formal, A.Actual, Visitor); - Score += Visitor.getBonusFromPendingPHIs(); - - LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization score = " - << Score << "\n"); // Discard unprofitable specialisations. - if (!ForceSpecialization && Score <= SpecCost) + if (!ForceSpecialization && Score <= 0) continue; // Create a new specialisation entry. diff --git a/openmp/runtime/src/ompt-event-specific.h b/openmp/runtime/src/ompt-event-specific.h index 5ac7f6d1e4e6..7736ba853163 100644 --- a/openmp/runtime/src/ompt-event-specific.h +++ b/openmp/runtime/src/ompt-event-specific.h @@ -55,13 +55,12 @@ #define ompt_callback_implicit_task_implemented ompt_event_MAY_ALWAYS -#define ompt_callback_target_implemented ompt_event_UNIMPLEMENTED -#define ompt_callback_target_emi_implemented ompt_event_UNIMPLEMENTED -#define ompt_callback_target_data_op_implemented ompt_event_UNIMPLEMENTED -#define ompt_callback_target_data_op_emi_implemented ompt_event_UNIMPLEMENTED -#define ompt_callback_target_submit_implemented ompt_event_UNIMPLEMENTED -#define ompt_callback_target_submit_emi_implemented ompt_event_UNIMPLEMENTED - +#define ompt_callback_target_implemented ompt_event_MAY_ALWAYS +#define ompt_callback_target_emi_implemented ompt_event_MAY_ALWAYS +#define ompt_callback_target_data_op_implemented ompt_event_MAY_ALWAYS +#define ompt_callback_target_data_op_emi_implemented ompt_event_MAY_ALWAYS +#define ompt_callback_target_submit_implemented ompt_event_MAY_ALWAYS +#define ompt_callback_target_submit_emi_implemented ompt_event_MAY_ALWAYS #define ompt_callback_control_tool_implemented ompt_event_MAY_ALWAYS #define ompt_callback_device_initialize_implemented ompt_event_MAY_ALWAYS From c938c0a643200ec844981864ac587bc6c1f576aa Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Sat, 2 Sep 2023 16:29:16 +0200 Subject: [PATCH 2/3] Vendor import of llvm-project branch release/17.x llvmorg-17.0.0-rc3-79-ga612cb0b81d8. 
--- clang/include/clang/AST/DeclBase.h | 6 +- clang/include/clang/Basic/CodeGenOptions.def | 1 - .../include/clang/Basic/DiagnosticASTKinds.td | 2 + clang/include/clang/Basic/DiagnosticGroups.td | 1 + .../include/clang/Basic/DiagnosticLexKinds.td | 4 + clang/include/clang/Basic/Sanitizers.h | 4 + clang/include/clang/Basic/TargetInfo.h | 4 +- clang/include/clang/Basic/riscv_vector.td | 52 +--- clang/include/clang/CodeGen/CGFunctionInfo.h | 29 +- clang/include/clang/Driver/Options.td | 14 +- clang/include/clang/Driver/ToolChain.h | 2 +- clang/include/clang/Sema/Sema.h | 2 - clang/lib/AST/ASTContext.cpp | 5 +- clang/lib/AST/ExprConstant.cpp | 27 +- clang/lib/Basic/Targets/LoongArch.cpp | 22 +- clang/lib/Basic/Targets/LoongArch.h | 14 +- clang/lib/Basic/Targets/RISCV.cpp | 4 +- clang/lib/CodeGen/ABIInfoImpl.cpp | 13 +- clang/lib/CodeGen/ABIInfoImpl.h | 14 +- clang/lib/CodeGen/BackendUtil.cpp | 23 +- clang/lib/CodeGen/CGCXXABI.cpp | 3 +- clang/lib/CodeGen/CGCall.cpp | 244 ++++++++++------- clang/lib/CodeGen/CGCall.h | 29 ++ clang/lib/CodeGen/CGClass.cpp | 106 +++++++- clang/lib/CodeGen/CGCoroutine.cpp | 33 +++ clang/lib/CodeGen/CGDebugInfo.cpp | 13 +- clang/lib/CodeGen/CGDebugInfo.h | 2 +- clang/lib/CodeGen/CGDecl.cpp | 2 +- clang/lib/CodeGen/CGDeclCXX.cpp | 4 +- clang/lib/CodeGen/CGExpr.cpp | 9 +- clang/lib/CodeGen/CGExprConstant.cpp | 2 +- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 11 +- clang/lib/CodeGen/CodeGenABITypes.cpp | 5 +- clang/lib/CodeGen/CodeGenFunction.cpp | 26 +- clang/lib/CodeGen/CodeGenFunction.h | 19 +- clang/lib/CodeGen/CodeGenModule.cpp | 32 ++- clang/lib/CodeGen/CodeGenModule.h | 20 +- clang/lib/CodeGen/CodeGenTypes.h | 12 +- clang/lib/CodeGen/ItaniumCXXABI.cpp | 2 +- clang/lib/CodeGen/MicrosoftCXXABI.cpp | 3 +- clang/lib/CodeGen/Targets/LoongArch.cpp | 11 +- clang/lib/CodeGen/Targets/RISCV.cpp | 24 +- clang/lib/CodeGen/Targets/X86.cpp | 16 +- clang/lib/Driver/Driver.cpp | 9 +- clang/lib/Driver/SanitizerArgs.cpp | 32 +++ clang/lib/Driver/ToolChain.cpp | 6 + .../lib/Driver/ToolChains/Arch/LoongArch.cpp | 50 ++-- clang/lib/Driver/ToolChains/Arch/LoongArch.h | 6 + clang/lib/Driver/ToolChains/Arch/X86.cpp | 14 +- clang/lib/Driver/ToolChains/Arch/X86.h | 2 +- clang/lib/Driver/ToolChains/Clang.cpp | 31 +-- clang/lib/Driver/ToolChains/CommonArgs.cpp | 11 +- clang/lib/Driver/ToolChains/Gnu.cpp | 22 +- clang/lib/Driver/ToolChains/Hexagon.cpp | 5 + clang/lib/Driver/ToolChains/Solaris.cpp | 41 ++- clang/lib/Format/UnwrappedLineParser.cpp | 5 +- clang/lib/Headers/__clang_cuda_math.h | 2 +- .../Headers/__clang_hip_libdevice_declares.h | 2 +- clang/lib/Headers/cpuid.h | 10 - clang/lib/Interpreter/IncrementalExecutor.cpp | 19 +- clang/lib/Lex/LiteralSupport.cpp | 41 ++- clang/lib/Parse/ParseDeclCXX.cpp | 19 +- clang/lib/Parse/ParseTentative.cpp | 1 + clang/lib/Sema/SemaAvailability.cpp | 12 + clang/lib/Sema/SemaCast.cpp | 8 + clang/lib/Sema/SemaDecl.cpp | 3 +- clang/lib/Sema/SemaExpr.cpp | 95 +++---- clang/lib/Sema/SemaLookup.cpp | 68 +++-- clang/lib/Sema/TreeTransform.h | 4 + clang/lib/Serialization/ASTReaderDecl.cpp | 66 +++-- clang/lib/Serialization/ASTWriterDecl.cpp | 4 +- compiler-rt/lib/asan/asan_interceptors.cpp | 56 ++-- compiler-rt/lib/asan/asan_interceptors.h | 2 - compiler-rt/lib/asan/asan_win_dll_thunk.cpp | 2 + compiler-rt/lib/builtins/clear_cache.c | 2 +- compiler-rt/lib/builtins/cpu_model.c | 5 +- compiler-rt/lib/interception/interception.h | 2 +- compiler-rt/lib/msan/msan_interceptors.cpp | 37 +++ compiler-rt/lib/profile/InstrProfilingFile.c | 10 +- 
.../sanitizer_common_interceptors.inc | 73 +++-- .../sanitizer_stacktrace_sparc.cpp | 6 - .../sanitizer_unwind_linux_libcdep.cpp | 6 - .../symbolizer/scripts/global_symbols.txt | 7 + libcxx/include/__algorithm/pstl_sort.h | 1 + libcxx/include/__format/format_functions.h | 3 + .../locale_base_api/locale_guard.h | 1 + libcxx/include/__mdspan/layout_left.h | 2 +- libcxx/include/__std_clang_module | 226 ++++++++++++++++ .../__type_traits/is_nothrow_constructible.h | 3 +- libcxx/include/__type_traits/remove_cv.h | 2 +- libcxx/include/__type_traits/remove_cvref.h | 2 +- libcxx/include/module.modulemap.in | 64 ++--- libcxx/include/sstream | 50 ++-- libcxx/modules/std/atomic.cppm | 3 - libcxx/modules/std/execution.cppm | 2 +- libcxx/modules/std/filesystem.cppm | 4 +- libcxx/src/chrono.cpp | 2 +- libcxx/src/filesystem/filesystem_clock.cpp | 2 +- libunwind/src/Unwind-EHABI.cpp | 7 +- lld/ELF/Arch/LoongArch.cpp | 7 + lld/ELF/Arch/PPC.cpp | 12 +- lld/ELF/Arch/PPC64.cpp | 86 ++++-- lld/ELF/Target.h | 1 + lld/docs/ReleaseNotes.rst | 5 + .../GNUstepObjCRuntime/GNUstepObjCRuntime.cpp | 42 ++- .../Utility/RegisterContextPOSIX_arm64.cpp | 4 + .../Utility/RegisterContextPOSIX_arm64.h | 1 + .../Process/Utility/RegisterInfoPOSIX_arm64.h | 1 + .../RegisterContextPOSIXCore_arm64.cpp | 14 + .../elf-core/RegisterContextPOSIXCore_arm64.h | 1 + .../Process/elf-core/RegisterUtilities.h | 4 + llvm/include/llvm/ADT/FunctionExtras.h | 12 +- llvm/include/llvm/ADT/SmallVector.h | 4 +- llvm/include/llvm/Analysis/RegionInfoImpl.h | 4 +- llvm/include/llvm/Analysis/ValueTracking.h | 4 - .../include/llvm/CodeGen/CodeGenPassBuilder.h | 2 +- llvm/include/llvm/CodeGen/LowLevelType.h | 7 +- .../llvm/CodeGen/PreISelIntrinsicLowering.h | 4 + llvm/include/llvm/CodeGen/TargetInstrInfo.h | 17 -- llvm/include/llvm/Object/Wasm.h | 10 +- llvm/include/llvm/ObjectYAML/WasmYAML.h | 1 + llvm/include/llvm/Option/ArgList.h | 1 + llvm/include/llvm/Support/type_traits.h | 38 --- .../llvm/TargetParser/LoongArchTargetParser.h | 8 +- .../AggressiveInstCombine.h | 2 +- .../llvm/Transforms/Scalar/MemCpyOptimizer.h | 4 - llvm/lib/Analysis/ValueTracking.cpp | 7 - llvm/lib/CodeGen/CalcSpillWeights.cpp | 15 +- .../lib/CodeGen/ComplexDeinterleavingPass.cpp | 18 +- llvm/lib/CodeGen/InlineSpiller.cpp | 34 ++- llvm/lib/CodeGen/LiveRangeEdit.cpp | 3 +- llvm/lib/CodeGen/LiveRangeShrink.cpp | 4 +- llvm/lib/CodeGen/MachineLICM.cpp | 4 + llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp | 54 ++-- llvm/lib/CodeGen/RegAllocGreedy.cpp | 21 +- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 +- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 3 + .../SelectionDAG/SelectionDAGBuilder.cpp | 99 ++++--- llvm/lib/CodeGen/SplitKit.cpp | 17 +- llvm/lib/CodeGen/SplitKit.h | 7 +- llvm/lib/CodeGen/TargetInstrInfo.cpp | 7 +- llvm/lib/CodeGen/TargetLoweringBase.cpp | 2 +- .../CodeGen/TargetLoweringObjectFileImpl.cpp | 17 -- llvm/lib/LTO/LTO.cpp | 7 +- llvm/lib/ObjCopy/wasm/WasmObject.h | 1 + llvm/lib/ObjCopy/wasm/WasmReader.cpp | 4 +- llvm/lib/ObjCopy/wasm/WasmWriter.cpp | 13 +- llvm/lib/Object/SymbolSize.cpp | 17 +- llvm/lib/Object/WasmObjectFile.cpp | 4 + llvm/lib/ObjectYAML/WasmEmitter.cpp | 12 +- llvm/lib/ObjectYAML/WasmYAML.cpp | 1 + llvm/lib/Option/ArgList.cpp | 7 + llvm/lib/TableGen/TGParser.cpp | 9 +- llvm/lib/Target/AArch64/AArch64.td | 6 +- .../Target/AArch64/AArch64FrameLowering.cpp | 13 +- .../Target/AArch64/AArch64ISelLowering.cpp | 19 +- .../lib/Target/AArch64/AArch64InstrFormats.td | 9 +- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 11 +- 
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 20 +- .../AArch64/AArch64LoadStoreOptimizer.cpp | 8 +- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 49 ++-- llvm/lib/Target/AArch64/AArch64Subtarget.h | 2 +- .../AArch64/GISel/AArch64CallLowering.cpp | 5 + llvm/lib/Target/AArch64/SVEInstrFormats.td | 7 + llvm/lib/Target/AMDGPU/AMDGPU.h | 4 - llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 37 ++- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 2 +- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 47 +++- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 2 +- .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 16 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4 - llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 16 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 7 +- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 45 +-- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 19 +- llvm/lib/Target/AMDGPU/SIInstructions.td | 7 - llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 136 ++-------- llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp | 141 ---------- .../Target/AMDGPU/SIMachineFunctionInfo.cpp | 69 ++--- .../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 39 ++- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 23 +- llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 17 +- .../lib/Target/ARM/ARMTargetTransformInfo.cpp | 2 + .../ARM/MCTargetDesc/ARMMCCodeEmitter.cpp | 6 +- .../lib/Target/BPF/BPFMISimplifyPatchable.cpp | 26 +- llvm/lib/Target/BPF/BTFDebug.cpp | 2 + .../Target/PowerPC/AsmParser/PPCAsmParser.cpp | 50 +++- .../PowerPC/MCTargetDesc/PPCInstPrinter.cpp | 14 +- llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 37 +-- llvm/lib/Target/PowerPC/PPCInstrFormats.td | 6 + llvm/lib/Target/PowerPC/PPCInstrInfo.td | 9 + llvm/lib/Target/PowerPC/PPCMCInstLower.cpp | 4 - llvm/lib/Target/PowerPC/PPCScheduleP9.td | 2 +- llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp | 15 +- .../RISCV/RISCVExpandAtomicPseudoInsts.cpp | 9 + llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 39 +-- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 13 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 26 +- .../Target/RISCV/RISCVPushPopOptimizer.cpp | 3 +- llvm/lib/Target/Sparc/SparcInstrInfo.td | 16 ++ .../SystemZ/SystemZTargetTransformInfo.cpp | 5 + llvm/lib/Target/X86/X86.td | 7 + llvm/lib/Target/X86/X86ISelLowering.cpp | 96 ++++--- llvm/lib/Target/X86/X86ISelLowering.h | 2 - llvm/lib/Target/X86/X86InstrAVX512.td | 10 + llvm/lib/Target/X86/X86InstrSSE.td | 5 + .../lib/Target/X86/X86TargetTransformInfo.cpp | 14 +- llvm/lib/Target/X86/X86TargetTransformInfo.h | 1 + llvm/lib/TargetParser/Host.cpp | 10 +- .../TargetParser/LoongArchTargetParser.cpp | 17 +- .../AggressiveInstCombine.cpp | 217 ++++----------- llvm/lib/Transforms/Coroutines/CoroElide.cpp | 83 ++++-- .../InstCombine/InstructionCombining.cpp | 2 +- .../ControlHeightReduction.cpp | 14 + .../Instrumentation/GCOVProfiling.cpp | 4 +- .../Scalar/ConstraintElimination.cpp | 2 +- .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 256 +----------------- .../Scalar/TailRecursionElimination.cpp | 6 + .../lib/Transforms/Utils/SimplifyLibCalls.cpp | 14 +- llvm/tools/llvm-readobj/ELFDumper.cpp | 2 +- 221 files changed, 2488 insertions(+), 2070 deletions(-) create mode 100644 libcxx/include/__std_clang_module delete mode 100644 llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h index 1b99709ca90d..12137387b676 100644 --- a/clang/include/clang/AST/DeclBase.h +++ b/clang/include/clang/AST/DeclBase.h @@ -1702,7 +1702,7 @@ class 
DeclContext { }; /// Number of non-inherited bits in FunctionDeclBitfields. - enum { NumFunctionDeclBits = 30 }; + enum { NumFunctionDeclBits = 31 }; /// Stores the bits used by CXXConstructorDecl. If modified /// NumCXXConstructorDeclBits and the accessor @@ -1714,12 +1714,12 @@ class DeclContext { /// For the bits in FunctionDeclBitfields. uint64_t : NumFunctionDeclBits; - /// 21 bits to fit in the remaining available space. + /// 20 bits to fit in the remaining available space. /// Note that this makes CXXConstructorDeclBitfields take /// exactly 64 bits and thus the width of NumCtorInitializers /// will need to be shrunk if some bit is added to NumDeclContextBitfields, /// NumFunctionDeclBitfields or CXXConstructorDeclBitfields. - uint64_t NumCtorInitializers : 18; + uint64_t NumCtorInitializers : 17; uint64_t IsInheritingConstructor : 1; /// Whether this constructor has a trail-allocated explicit specifier. diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index 11aec88c5335..d492b8681c5d 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -165,7 +165,6 @@ CODEGENOPT(PrepareForThinLTO , 1, 0) ///< Set when -flto=thin is enabled on the ///< compile step. CODEGENOPT(LTOUnit, 1, 0) ///< Emit IR to support LTO unit features (CFI, whole ///< program vtable opt). -CODEGENOPT(FatLTO, 1, 0) ///< Set when -ffat-lto-objects is enabled. CODEGENOPT(EnableSplitLTOUnit, 1, 0) ///< Enable LTO unit splitting to support /// CFI and traditional whole program /// devirtualization that require whole diff --git a/clang/include/clang/Basic/DiagnosticASTKinds.td b/clang/include/clang/Basic/DiagnosticASTKinds.td index 566cdc340605..0794ed7ba683 100644 --- a/clang/include/clang/Basic/DiagnosticASTKinds.td +++ b/clang/include/clang/Basic/DiagnosticASTKinds.td @@ -70,6 +70,8 @@ def note_consteval_address_accessible : Note< "is not a constant expression">; def note_constexpr_uninitialized : Note< "subobject %0 is not initialized">; +def note_constexpr_uninitialized_base : Note< + "constructor of base class %0 is not called">; def note_constexpr_static_local : Note< "control flows through the definition of a %select{static|thread_local}0 variable">; def note_constexpr_subobject_declared_here : Note< diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 7b4d415bf064..26bc88a980e4 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -20,6 +20,7 @@ def DeprecatedStaticAnalyzerFlag : DiagGroup<"deprecated-static-analyzer-flag">; // Empty DiagGroups are recognized by clang but ignored. def ODR : DiagGroup<"odr">; def : DiagGroup<"abi">; +def : DiagGroup<"gnu-empty-initializer">; // Now a C extension, not GNU. 
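The DeclBase.h hunk above widens NumFunctionDeclBits and shrinks NumCtorInitializers by the same amount so that CXXConstructorDeclBitfields keeps taking exactly 64 bits. A minimal sketch of the unnamed-bit-field overlay pattern involved, with made-up widths rather than clang's real layout:

    #include <cstdint>

    // Hypothetical widths for illustration only.
    enum { NumBaseBits = 13, NumDerivedBits = 31 };

    struct BaseBits {
      uint64_t Flags : NumBaseBits;
    };

    struct DerivedBits {
      uint64_t : NumBaseBits;      // skip the bits owned by BaseBits
      uint64_t Extra : NumDerivedBits;
    };

    struct MostDerivedBits {
      uint64_t : NumBaseBits;      // skip BaseBits
      uint64_t : NumDerivedBits;   // skip DerivedBits
      uint64_t Count : 19;         // 13 + 31 + 19 + 1 == 64, so widening any
      uint64_t Flag : 1;           // earlier field forces Count to shrink
    };

    static_assert(sizeof(MostDerivedBits) == sizeof(uint64_t),
                  "all three views overlay a single 64-bit word");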
def AbsoluteValue : DiagGroup<"absolute-value">; def MisspelledAssumption : DiagGroup<"misspelled-assumption">; def UnknownAssumption : DiagGroup<"unknown-assumption">; diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td index 0eb270aeea0e..6ad691975bd5 100644 --- a/clang/include/clang/Basic/DiagnosticLexKinds.td +++ b/clang/include/clang/Basic/DiagnosticLexKinds.td @@ -285,6 +285,10 @@ def ext_ms_reserved_user_defined_literal : ExtWarn< def err_unsupported_string_concat : Error< "unsupported non-standard concatenation of string literals">; +def warn_unevaluated_string_prefix : Warning< + "encoding prefix '%0' on an unevaluated string literal has no effect" + "%select{| and is incompatible with c++2c}1">, + InGroup>; def err_unevaluated_string_prefix : Error< "an unevaluated string literal cannot have an encoding prefix">; def err_unevaluated_string_udl : Error< diff --git a/clang/include/clang/Basic/Sanitizers.h b/clang/include/clang/Basic/Sanitizers.h index db53010645ae..4659e45c7883 100644 --- a/clang/include/clang/Basic/Sanitizers.h +++ b/clang/include/clang/Basic/Sanitizers.h @@ -23,7 +23,11 @@ namespace llvm { class hash_code; +class Triple; +namespace opt { +class ArgList; } +} // namespace llvm namespace clang { diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index 41ef47eb565b..61be52149341 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -1414,7 +1414,9 @@ class TargetInfo : public TransferrableTargetInfo, /// Identify whether this target supports IFuncs. bool supportsIFunc() const { - return getTriple().isOSBinFormatELF() && !getTriple().isOSFuchsia(); + return getTriple().isOSBinFormatELF() && + ((getTriple().isOSLinux() && !getTriple().isMusl()) || + getTriple().isOSFreeBSD()); } // Validate the contents of the __builtin_cpu_supports(const char*) diff --git a/clang/include/clang/Basic/riscv_vector.td b/clang/include/clang/Basic/riscv_vector.td index 7e5889812aec..6adc60031341 100644 --- a/clang/include/clang/Basic/riscv_vector.td +++ b/clang/include/clang/Basic/riscv_vector.td @@ -112,7 +112,7 @@ multiclass RVVIntBinBuiltinSet multiclass RVVSlideOneBuiltinSet : RVVOutOp1BuiltinSet; + ["vx", "Uv", "UvUvUe"]]>; multiclass RVVSignedShiftBuiltinSet : RVVOutOp1BuiltinSet paramInfos, - CanQualType resultType, - ArrayRef argTypes, - RequiredArgs required); + static CGFunctionInfo * + create(unsigned llvmCC, bool instanceMethod, bool chainCall, + bool delegateCall, const FunctionType::ExtInfo &extInfo, + ArrayRef paramInfos, CanQualType resultType, + ArrayRef argTypes, RequiredArgs required); void operator delete(void *p) { ::operator delete(p); } // Friending class TrailingObjects is apparently not good enough for MSVC, @@ -663,6 +664,8 @@ class CGFunctionInfo final bool isChainCall() const { return ChainCall; } + bool isDelegateCall() const { return DelegateCall; } + bool isCmseNSCall() const { return CmseNSCall; } bool isNoReturn() const { return NoReturn; } @@ -749,6 +752,7 @@ class CGFunctionInfo final ID.AddInteger(getASTCallingConvention()); ID.AddBoolean(InstanceMethod); ID.AddBoolean(ChainCall); + ID.AddBoolean(DelegateCall); ID.AddBoolean(NoReturn); ID.AddBoolean(ReturnsRetained); ID.AddBoolean(NoCallerSavedRegs); @@ -766,17 +770,16 @@ class CGFunctionInfo final for (const auto &I : arguments()) I.type.Profile(ID); } - static void Profile(llvm::FoldingSetNodeID &ID, - bool InstanceMethod, - bool ChainCall, + static void 
Profile(llvm::FoldingSetNodeID &ID, bool InstanceMethod, + bool ChainCall, bool IsDelegateCall, const FunctionType::ExtInfo &info, ArrayRef paramInfos, - RequiredArgs required, - CanQualType resultType, + RequiredArgs required, CanQualType resultType, ArrayRef argTypes) { ID.AddInteger(info.getCC()); ID.AddBoolean(InstanceMethod); ID.AddBoolean(ChainCall); + ID.AddBoolean(IsDelegateCall); ID.AddBoolean(info.getNoReturn()); ID.AddBoolean(info.getProducesResult()); ID.AddBoolean(info.getNoCallerSavedRegs()); diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 229f6141c750..e04f67bdb1fa 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2375,11 +2375,6 @@ def fthin_link_bitcode_EQ : Joined<["-"], "fthin-link-bitcode=">, Flags<[CoreOption, CC1Option]>, Group, HelpText<"Write minimized bitcode to for the ThinLTO thin link only">, MarshallingInfoString>; -defm fat_lto_objects : BoolFOption<"fat-lto-objects", - CodeGenOpts<"FatLTO">, DefaultFalse, - PosFlag, - NegFlag, - BothFlags<[CC1Option], " fat LTO object support">>; def fmacro_backtrace_limit_EQ : Joined<["-"], "fmacro-backtrace-limit=">, Group, Flags<[NoXarchOption, CC1Option, CoreOption]>, HelpText<"Set the maximum number of entries to print in a macro expansion backtrace (0 = no limit)">, @@ -5097,6 +5092,10 @@ def mretpoline_external_thunk : Flag<["-"], "mretpoline-external-thunk">, Group< def mno_retpoline_external_thunk : Flag<["-"], "mno-retpoline-external-thunk">, Group; def mvzeroupper : Flag<["-"], "mvzeroupper">, Group; def mno_vzeroupper : Flag<["-"], "mno-vzeroupper">, Group; +def mno_gather : Flag<["-"], "mno-gather">, Group, + HelpText<"Disable generation of gather instructions in auto-vectorization(x86 only)">; +def mno_scatter : Flag<["-"], "mno-scatter">, Group, + HelpText<"Disable generation of scatter instructions in auto-vectorization(x86 only)">; // These are legacy user-facing driver-level option spellings. They are always // aliases for options that are spelled using the more common Unix / GNU flag @@ -5162,6 +5161,7 @@ defm caller_saves : BooleanFFlag<"caller-saves">, Group, Group; defm branch_count_reg : BooleanFFlag<"branch-count-reg">, Group; defm default_inline : BooleanFFlag<"default-inline">, Group; +defm fat_lto_objects : BooleanFFlag<"fat-lto-objects">, Group; defm float_store : BooleanFFlag<"float-store">, Group; defm friend_injection : BooleanFFlag<"friend-injection">, Group; defm function_attribute_list : BooleanFFlag<"function-attribute-list">, Group; @@ -7152,6 +7152,10 @@ def _SLASH_QIntel_jcc_erratum : CLFlag<"QIntel-jcc-erratum">, Alias; def _SLASH_arm64EC : CLFlag<"arm64EC">, HelpText<"Set build target to arm64ec">; +def : CLFlag<"Qgather-">, Alias, + HelpText<"Disable generation of gather instructions in auto-vectorization(x86 only)">; +def : CLFlag<"Qscatter-">, Alias, + HelpText<"Disable generation of scatter instructions in auto-vectorization(x86 only)">; // Non-aliases: diff --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h index e3fcbd9322b0..2e74507f7126 100644 --- a/clang/include/clang/Driver/ToolChain.h +++ b/clang/include/clang/Driver/ToolChain.h @@ -561,7 +561,7 @@ class ToolChain { // Return the DWARF version to emit, in the absence of arguments // to the contrary. 
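The DiagnosticLexKinds.td hunk above adds warn_unevaluated_string_prefix next to the existing err_unevaluated_string_prefix. A minimal illustration, assuming the static_assert message is one of the contexts Clang treats as an unevaluated string:

    // The message is never evaluated as an expression, so the encoding prefix
    // contributes nothing; depending on the language mode this draws the new
    // warning or the pre-existing error quoted above.
    static_assert(sizeof(char) == 1, L"the L prefix adds nothing here");

    static_assert(sizeof(char) == 1, "unprefixed message, no diagnostic");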
- virtual unsigned GetDefaultDwarfVersion() const { return 5; } + virtual unsigned GetDefaultDwarfVersion() const; // Some toolchains may have different restrictions on the DWARF version and // may need to adjust it. E.g. NVPTX may need to enforce DWARF2 even when host diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 3418a37b3077..cfd1c0f977c0 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -12694,8 +12694,6 @@ class Sema final { QualType CheckBitwiseOperands( // C99 6.5.[10...12] ExprResult &LHS, ExprResult &RHS, SourceLocation Loc, BinaryOperatorKind Opc); - void diagnoseLogicalInsteadOfBitwise(Expr *Op1, Expr *Op2, SourceLocation Loc, - BinaryOperatorKind Opc); QualType CheckLogicalOperands( // C99 6.5.[13,14] ExprResult &LHS, ExprResult &RHS, SourceLocation Loc, BinaryOperatorKind Opc); diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 7acacd7bf4f5..76000156fece 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -9612,9 +9612,8 @@ bool ASTContext::areLaxCompatibleRVVTypes(QualType FirstType, const LangOptions::LaxVectorConversionKind LVCKind = getLangOpts().getLaxVectorConversions(); - // If __riscv_v_fixed_vlen != N do not allow GNU vector lax conversion. - if (VecTy->getVectorKind() == VectorType::GenericVector && - getTypeSize(SecondType) != getRVVTypeSize(*this, BT)) + // If __riscv_v_fixed_vlen != N do not allow vector lax conversion. + if (getTypeSize(SecondType) != getRVVTypeSize(*this, BT)) return false; // If -flax-vector-conversions=all is specified, the types are diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index f1c842e26199..f1bad0c7f7f2 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -2418,9 +2418,16 @@ static bool CheckEvaluationResult(CheckEvaluationResultKind CERK, if (const CXXRecordDecl *CD = dyn_cast(RD)) { unsigned BaseIndex = 0; for (const CXXBaseSpecifier &BS : CD->bases()) { - if (!CheckEvaluationResult(CERK, Info, DiagLoc, BS.getType(), - Value.getStructBase(BaseIndex), Kind, - /*SubobjectDecl=*/nullptr, CheckedTemps)) + const APValue &BaseValue = Value.getStructBase(BaseIndex); + if (!BaseValue.hasValue()) { + SourceLocation TypeBeginLoc = BS.getBaseTypeLoc(); + Info.FFDiag(TypeBeginLoc, diag::note_constexpr_uninitialized_base) + << BS.getType() << SourceRange(TypeBeginLoc, BS.getEndLoc()); + return false; + } + if (!CheckEvaluationResult(CERK, Info, DiagLoc, BS.getType(), BaseValue, + Kind, /*SubobjectDecl=*/nullptr, + CheckedTemps)) return false; ++BaseIndex; } @@ -15218,14 +15225,6 @@ static bool FastEvaluateAsRValue(const Expr *Exp, Expr::EvalResult &Result, return true; } - // FIXME: Evaluating values of large array and record types can cause - // performance problems. Only do so in C++11 for now. - if (Exp->isPRValue() && - (Exp->getType()->isArrayType() || Exp->getType()->isRecordType()) && - !Ctx.getLangOpts().CPlusPlus11) { - IsConst = false; - return true; - } return false; } @@ -15467,12 +15466,6 @@ bool Expr::EvaluateAsInitializer(APValue &Value, const ASTContext &Ctx, return Name; }); - // FIXME: Evaluating initializers for large array and record types can cause - // performance problems. Only do so in C++11 for now. 
- if (isPRValue() && (getType()->isArrayType() || getType()->isRecordType()) && - !Ctx.getLangOpts().CPlusPlus11) - return false; - Expr::EvalStatus EStatus; EStatus.Diag = &Notes; diff --git a/clang/lib/Basic/Targets/LoongArch.cpp b/clang/lib/Basic/Targets/LoongArch.cpp index f08e5e732b03..4448a2ae10a1 100644 --- a/clang/lib/Basic/Targets/LoongArch.cpp +++ b/clang/lib/Basic/Targets/LoongArch.cpp @@ -199,18 +199,14 @@ void LoongArchTargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__loongarch_frlen", "0"); // Define __loongarch_arch. - StringRef Arch = llvm::LoongArch::getArch(); - if (Arch.empty()) - Arch = llvm::LoongArch::getDefaultArch(GRLen == 64); - if (!Arch.empty()) - Builder.defineMacro("__loongarch_arch", Arch); + StringRef ArchName = getCPU(); + Builder.defineMacro("__loongarch_arch", Twine('"') + ArchName + Twine('"')); // Define __loongarch_tune. - StringRef TuneCPU = llvm::LoongArch::getTuneCPU(); + StringRef TuneCPU = getTargetOpts().TuneCPU; if (TuneCPU.empty()) - TuneCPU = Arch; - if (!TuneCPU.empty()) - Builder.defineMacro("__loongarch_tune", TuneCPU); + TuneCPU = ArchName; + Builder.defineMacro("__loongarch_tune", Twine('"') + TuneCPU + Twine('"')); StringRef ABI = getABI(); if (ABI == "lp64d" || ABI == "lp64f" || ABI == "lp64s") @@ -283,11 +279,11 @@ bool LoongArchTargetInfo::handleTargetFeatures( return true; } -bool LoongArchTargetInfo::isValidTuneCPUName(StringRef Name) const { - return llvm::LoongArch::isValidTuneCPUName(Name); +bool LoongArchTargetInfo::isValidCPUName(StringRef Name) const { + return llvm::LoongArch::isValidCPUName(Name); } -void LoongArchTargetInfo::fillValidTuneCPUList( +void LoongArchTargetInfo::fillValidCPUList( SmallVectorImpl &Values) const { - llvm::LoongArch::fillValidTuneCPUList(Values); + llvm::LoongArch::fillValidCPUList(Values); } diff --git a/clang/lib/Basic/Targets/LoongArch.h b/clang/lib/Basic/Targets/LoongArch.h index 60d545566b30..34143f462a24 100644 --- a/clang/lib/Basic/Targets/LoongArch.h +++ b/clang/lib/Basic/Targets/LoongArch.h @@ -24,6 +24,7 @@ namespace targets { class LLVM_LIBRARY_VISIBILITY LoongArchTargetInfo : public TargetInfo { protected: std::string ABI; + std::string CPU; bool HasFeatureD; bool HasFeatureF; @@ -40,6 +41,15 @@ class LLVM_LIBRARY_VISIBILITY LoongArchTargetInfo : public TargetInfo { WIntType = UnsignedInt; } + bool setCPU(const std::string &Name) override { + if (!isValidCPUName(Name)) + return false; + CPU = Name; + return true; + } + + StringRef getCPU() const { return CPU; } + StringRef getABI() const override { return ABI; } void getTargetDefines(const LangOptions &Opts, @@ -81,8 +91,8 @@ class LLVM_LIBRARY_VISIBILITY LoongArchTargetInfo : public TargetInfo { bool hasFeature(StringRef Feature) const override; - bool isValidTuneCPUName(StringRef Name) const override; - void fillValidTuneCPUList(SmallVectorImpl &Values) const override; + bool isValidCPUName(StringRef Name) const override; + void fillValidCPUList(SmallVectorImpl &Values) const override; }; class LLVM_LIBRARY_VISIBILITY LoongArch32TargetInfo diff --git a/clang/lib/Basic/Targets/RISCV.cpp b/clang/lib/Basic/Targets/RISCV.cpp index 94c894dfec0b..d55ab76395c8 100644 --- a/clang/lib/Basic/Targets/RISCV.cpp +++ b/clang/lib/Basic/Targets/RISCV.cpp @@ -196,8 +196,8 @@ void RISCVTargetInfo::getTargetDefines(const LangOptions &Opts, if (ISAInfo->hasExtension("zve32x")) { Builder.defineMacro("__riscv_vector"); - // Currently we support the v0.11 RISC-V V intrinsics. 
- Builder.defineMacro("__riscv_v_intrinsic", Twine(getVersionValue(0, 11))); + // Currently we support the v0.12 RISC-V V intrinsics. + Builder.defineMacro("__riscv_v_intrinsic", Twine(getVersionValue(0, 12))); } auto VScale = getVScaleRange(Opts); diff --git a/clang/lib/CodeGen/ABIInfoImpl.cpp b/clang/lib/CodeGen/ABIInfoImpl.cpp index 7c30cecfdb9b..2b20d5a13346 100644 --- a/clang/lib/CodeGen/ABIInfoImpl.cpp +++ b/clang/lib/CodeGen/ABIInfoImpl.cpp @@ -246,7 +246,7 @@ Address CodeGen::emitMergePHI(CodeGenFunction &CGF, Address Addr1, } bool CodeGen::isEmptyField(ASTContext &Context, const FieldDecl *FD, - bool AllowArrays) { + bool AllowArrays, bool AsIfNoUniqueAddr) { if (FD->isUnnamedBitfield()) return true; @@ -280,13 +280,14 @@ bool CodeGen::isEmptyField(ASTContext &Context, const FieldDecl *FD, // not arrays of records, so we must also check whether we stripped off an // array type above. if (isa(RT->getDecl()) && - (WasArray || !FD->hasAttr())) + (WasArray || (!AsIfNoUniqueAddr && !FD->hasAttr()))) return false; - return isEmptyRecord(Context, FT, AllowArrays); + return isEmptyRecord(Context, FT, AllowArrays, AsIfNoUniqueAddr); } -bool CodeGen::isEmptyRecord(ASTContext &Context, QualType T, bool AllowArrays) { +bool CodeGen::isEmptyRecord(ASTContext &Context, QualType T, bool AllowArrays, + bool AsIfNoUniqueAddr) { const RecordType *RT = T->getAs(); if (!RT) return false; @@ -297,11 +298,11 @@ bool CodeGen::isEmptyRecord(ASTContext &Context, QualType T, bool AllowArrays) { // If this is a C++ record, check the bases first. if (const CXXRecordDecl *CXXRD = dyn_cast(RD)) for (const auto &I : CXXRD->bases()) - if (!isEmptyRecord(Context, I.getType(), true)) + if (!isEmptyRecord(Context, I.getType(), true, AsIfNoUniqueAddr)) return false; for (const auto *I : RD->fields()) - if (!isEmptyField(Context, I, AllowArrays)) + if (!isEmptyField(Context, I, AllowArrays, AsIfNoUniqueAddr)) return false; return true; } diff --git a/clang/lib/CodeGen/ABIInfoImpl.h b/clang/lib/CodeGen/ABIInfoImpl.h index 5f0cc289af68..afde08ba100c 100644 --- a/clang/lib/CodeGen/ABIInfoImpl.h +++ b/clang/lib/CodeGen/ABIInfoImpl.h @@ -122,13 +122,19 @@ Address emitMergePHI(CodeGenFunction &CGF, Address Addr1, llvm::BasicBlock *Block2, const llvm::Twine &Name = ""); /// isEmptyField - Return true iff a the field is "empty", that is it -/// is an unnamed bit-field or an (array of) empty record(s). -bool isEmptyField(ASTContext &Context, const FieldDecl *FD, bool AllowArrays); +/// is an unnamed bit-field or an (array of) empty record(s). If +/// AsIfNoUniqueAddr is true, then C++ record fields are considered empty if +/// the [[no_unique_address]] attribute would have made them empty. +bool isEmptyField(ASTContext &Context, const FieldDecl *FD, bool AllowArrays, + bool AsIfNoUniqueAddr = false); /// isEmptyRecord - Return true iff a structure contains only empty /// fields. Note that a structure with a flexible array member is not -/// considered empty. -bool isEmptyRecord(ASTContext &Context, QualType T, bool AllowArrays); +/// considered empty. If AsIfNoUniqueAddr is true, then C++ record fields are +/// considered empty if the [[no_unique_address]] attribute would have made +/// them empty. +bool isEmptyRecord(ASTContext &Context, QualType T, bool AllowArrays, + bool AsIfNoUniqueAddr = false); /// isSingleElementStruct - Determine if a structure is a "single /// element struct", i.e. 
it has exactly one non-empty field or diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index cda03d69522d..483f3e787a78 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -55,7 +55,6 @@ #include "llvm/Target/TargetOptions.h" #include "llvm/TargetParser/SubtargetFeature.h" #include "llvm/TargetParser/Triple.h" -#include "llvm/Transforms/IPO/EmbedBitcodePass.h" #include "llvm/Transforms/IPO/LowerTypeTests.h" #include "llvm/Transforms/IPO/ThinLTOBitcodeWriter.h" #include "llvm/Transforms/InstCombine/InstCombine.h" @@ -1016,12 +1015,7 @@ void EmitAssemblyHelper::RunOptimizationPipeline( }); } - bool IsThinOrUnifiedLTO = IsThinLTO || (IsLTO && CodeGenOpts.UnifiedLTO); - if (CodeGenOpts.FatLTO) { - MPM = PB.buildFatLTODefaultPipeline(Level, IsThinOrUnifiedLTO, - IsThinOrUnifiedLTO || - shouldEmitRegularLTOSummary()); - } else if (IsThinOrUnifiedLTO) { + if (IsThinLTO || (IsLTO && CodeGenOpts.UnifiedLTO)) { MPM = PB.buildThinLTOPreLinkDefaultPipeline(Level); } else if (IsLTO) { MPM = PB.buildLTOPreLinkDefaultPipeline(Level); @@ -1077,21 +1071,6 @@ void EmitAssemblyHelper::RunOptimizationPipeline( EmitLTOSummary)); } } - if (CodeGenOpts.FatLTO) { - // Set module flags, like EnableSplitLTOUnit and UnifiedLTO, since FatLTO - // uses a different action than Backend_EmitBC or Backend_EmitLL. - bool IsThinOrUnifiedLTO = - CodeGenOpts.PrepareForThinLTO || - (CodeGenOpts.PrepareForLTO && CodeGenOpts.UnifiedLTO); - if (!TheModule->getModuleFlag("ThinLTO")) - TheModule->addModuleFlag(Module::Error, "ThinLTO", - uint32_t(IsThinOrUnifiedLTO)); - if (!TheModule->getModuleFlag("EnableSplitLTOUnit")) - TheModule->addModuleFlag(Module::Error, "EnableSplitLTOUnit", - uint32_t(CodeGenOpts.EnableSplitLTOUnit)); - if (CodeGenOpts.UnifiedLTO && !TheModule->getModuleFlag("UnifiedLTO")) - TheModule->addModuleFlag(Module::Error, "UnifiedLTO", uint32_t(1)); - } // Now that we have all of the passes ready, run them. { diff --git a/clang/lib/CodeGen/CGCXXABI.cpp b/clang/lib/CodeGen/CGCXXABI.cpp index 7b77dd7875bc..4df6f6505ef6 100644 --- a/clang/lib/CodeGen/CGCXXABI.cpp +++ b/clang/lib/CodeGen/CGCXXABI.cpp @@ -312,8 +312,7 @@ void CGCXXABI::setCXXDestructorDLLStorage(llvm::GlobalValue *GV, llvm::GlobalValue::LinkageTypes CGCXXABI::getCXXDestructorLinkage( GVALinkage Linkage, const CXXDestructorDecl *Dtor, CXXDtorType DT) const { // Delegate back to CGM by default. - return CGM.getLLVMLinkageForDeclarator(Dtor, Linkage, - /*IsConstantVariable=*/false); + return CGM.getLLVMLinkageForDeclarator(Dtor, Linkage); } bool CGCXXABI::NeedsVTTParameter(GlobalDecl GD) { diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index bd272e016e92..0d1e9ad439b7 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -13,6 +13,7 @@ #include "CGCall.h" #include "ABIInfo.h" +#include "ABIInfoImpl.h" #include "CGBlocks.h" #include "CGCXXABI.h" #include "CGCleanup.h" @@ -112,8 +113,7 @@ CodeGenTypes::arrangeFreeFunctionType(CanQual FTNP) { // When translating an unprototyped function type, always use a // variadic type. 
return arrangeLLVMFunctionInfo(FTNP->getReturnType().getUnqualifiedType(), - /*instanceMethod=*/false, - /*chainCall=*/false, std::nullopt, + FnInfoOpts::None, std::nullopt, FTNP->getExtInfo(), {}, RequiredArgs(0)); } @@ -189,10 +189,10 @@ arrangeLLVMFunctionInfo(CodeGenTypes &CGT, bool instanceMethod, appendParameterTypes(CGT, prefix, paramInfos, FTP); CanQualType resultType = FTP->getReturnType().getUnqualifiedType(); - return CGT.arrangeLLVMFunctionInfo(resultType, instanceMethod, - /*chainCall=*/false, prefix, - FTP->getExtInfo(), paramInfos, - Required); + FnInfoOpts opts = + instanceMethod ? FnInfoOpts::IsInstanceMethod : FnInfoOpts::None; + return CGT.arrangeLLVMFunctionInfo(resultType, opts, prefix, + FTP->getExtInfo(), paramInfos, Required); } /// Arrange the argument and result information for a value of the @@ -271,7 +271,7 @@ CodeGenTypes::arrangeCXXMethodType(const CXXRecordDecl *RD, argTypes.push_back(DeriveThisType(RD, MD)); return ::arrangeLLVMFunctionInfo( - *this, true, argTypes, + *this, /*instanceMethod=*/true, argTypes, FTP->getCanonicalTypeUnqualified().getAs()); } @@ -363,9 +363,8 @@ CodeGenTypes::arrangeCXXStructorDeclaration(GlobalDecl GD) { : TheCXXABI.hasMostDerivedReturn(GD) ? CGM.getContext().VoidPtrTy : Context.VoidTy; - return arrangeLLVMFunctionInfo(resultType, /*instanceMethod=*/true, - /*chainCall=*/false, argTypes, extInfo, - paramInfos, required); + return arrangeLLVMFunctionInfo(resultType, FnInfoOpts::IsInstanceMethod, + argTypes, extInfo, paramInfos, required); } static SmallVector @@ -439,9 +438,9 @@ CodeGenTypes::arrangeCXXConstructorCall(const CallArgList &args, addExtParameterInfosForCall(ParamInfos, FPT.getTypePtr(), TotalPrefixArgs, ArgTypes.size()); } - return arrangeLLVMFunctionInfo(ResultType, /*instanceMethod=*/true, - /*chainCall=*/false, ArgTypes, Info, - ParamInfos, Required); + + return arrangeLLVMFunctionInfo(ResultType, FnInfoOpts::IsInstanceMethod, + ArgTypes, Info, ParamInfos, Required); } /// Arrange the argument and result information for the declaration or @@ -460,10 +459,9 @@ CodeGenTypes::arrangeFunctionDeclaration(const FunctionDecl *FD) { // When declaring a function without a prototype, always use a // non-variadic type. if (CanQual noProto = FTy.getAs()) { - return arrangeLLVMFunctionInfo( - noProto->getReturnType(), /*instanceMethod=*/false, - /*chainCall=*/false, std::nullopt, noProto->getExtInfo(), {}, - RequiredArgs::All); + return arrangeLLVMFunctionInfo(noProto->getReturnType(), FnInfoOpts::None, + std::nullopt, noProto->getExtInfo(), {}, + RequiredArgs::All); } return arrangeFreeFunctionType(FTy.castAs()); @@ -512,9 +510,9 @@ CodeGenTypes::arrangeObjCMessageSendSignature(const ObjCMethodDecl *MD, RequiredArgs required = (MD->isVariadic() ? 
RequiredArgs(argTys.size()) : RequiredArgs::All); - return arrangeLLVMFunctionInfo( - GetReturnType(MD->getReturnType()), /*instanceMethod=*/false, - /*chainCall=*/false, argTys, einfo, extParamInfos, required); + return arrangeLLVMFunctionInfo(GetReturnType(MD->getReturnType()), + FnInfoOpts::None, argTys, einfo, extParamInfos, + required); } const CGFunctionInfo & @@ -523,9 +521,8 @@ CodeGenTypes::arrangeUnprototypedObjCMessageSend(QualType returnType, auto argTypes = getArgTypesForCall(Context, args); FunctionType::ExtInfo einfo; - return arrangeLLVMFunctionInfo( - GetReturnType(returnType), /*instanceMethod=*/false, - /*chainCall=*/false, argTypes, einfo, {}, RequiredArgs::All); + return arrangeLLVMFunctionInfo(GetReturnType(returnType), FnInfoOpts::None, + argTypes, einfo, {}, RequiredArgs::All); } const CGFunctionInfo & @@ -550,8 +547,7 @@ CodeGenTypes::arrangeUnprototypedMustTailThunk(const CXXMethodDecl *MD) { assert(MD->isVirtual() && "only methods have thunks"); CanQual FTP = GetFormalType(MD); CanQualType ArgTys[] = {DeriveThisType(MD->getParent(), MD)}; - return arrangeLLVMFunctionInfo(Context.VoidTy, /*instanceMethod=*/false, - /*chainCall=*/false, ArgTys, + return arrangeLLVMFunctionInfo(Context.VoidTy, FnInfoOpts::None, ArgTys, FTP->getExtInfo(), {}, RequiredArgs(1)); } @@ -570,9 +566,8 @@ CodeGenTypes::arrangeMSCtorClosure(const CXXConstructorDecl *CD, ArgTys.push_back(Context.IntTy); CallingConv CC = Context.getDefaultCallingConvention( /*IsVariadic=*/false, /*IsCXXMethod=*/true); - return arrangeLLVMFunctionInfo(Context.VoidTy, /*instanceMethod=*/true, - /*chainCall=*/false, ArgTys, - FunctionType::ExtInfo(CC), {}, + return arrangeLLVMFunctionInfo(Context.VoidTy, FnInfoOpts::IsInstanceMethod, + ArgTys, FunctionType::ExtInfo(CC), {}, RequiredArgs::All); } @@ -616,10 +611,10 @@ arrangeFreeFunctionLikeCall(CodeGenTypes &CGT, SmallVector argTypes; for (const auto &arg : args) argTypes.push_back(CGT.getContext().getCanonicalParamType(arg.Ty)); + FnInfoOpts opts = chainCall ? 
FnInfoOpts::IsChainCall : FnInfoOpts::None; return CGT.arrangeLLVMFunctionInfo(GetReturnType(fnType->getReturnType()), - /*instanceMethod=*/false, chainCall, - argTypes, fnType->getExtInfo(), paramInfos, - required); + opts, argTypes, fnType->getExtInfo(), + paramInfos, required); } /// Figure out the rules for calling a function with the given formal @@ -650,8 +645,8 @@ CodeGenTypes::arrangeBlockFunctionDeclaration(const FunctionProtoType *proto, auto argTypes = getArgTypesForDeclaration(Context, params); return arrangeLLVMFunctionInfo(GetReturnType(proto->getReturnType()), - /*instanceMethod*/ false, /*chainCall*/ false, - argTypes, proto->getExtInfo(), paramInfos, + FnInfoOpts::None, argTypes, + proto->getExtInfo(), paramInfos, RequiredArgs::forPrototypePlus(proto, 1)); } @@ -662,10 +657,9 @@ CodeGenTypes::arrangeBuiltinFunctionCall(QualType resultType, SmallVector argTypes; for (const auto &Arg : args) argTypes.push_back(Context.getCanonicalParamType(Arg.Ty)); - return arrangeLLVMFunctionInfo( - GetReturnType(resultType), /*instanceMethod=*/false, - /*chainCall=*/false, argTypes, FunctionType::ExtInfo(), - /*paramInfos=*/ {}, RequiredArgs::All); + return arrangeLLVMFunctionInfo(GetReturnType(resultType), FnInfoOpts::None, + argTypes, FunctionType::ExtInfo(), + /*paramInfos=*/{}, RequiredArgs::All); } const CGFunctionInfo & @@ -673,17 +667,17 @@ CodeGenTypes::arrangeBuiltinFunctionDeclaration(QualType resultType, const FunctionArgList &args) { auto argTypes = getArgTypesForDeclaration(Context, args); - return arrangeLLVMFunctionInfo( - GetReturnType(resultType), /*instanceMethod=*/false, /*chainCall=*/false, - argTypes, FunctionType::ExtInfo(), {}, RequiredArgs::All); + return arrangeLLVMFunctionInfo(GetReturnType(resultType), FnInfoOpts::None, + argTypes, FunctionType::ExtInfo(), {}, + RequiredArgs::All); } const CGFunctionInfo & CodeGenTypes::arrangeBuiltinFunctionDeclaration(CanQualType resultType, ArrayRef argTypes) { - return arrangeLLVMFunctionInfo( - resultType, /*instanceMethod=*/false, /*chainCall=*/false, - argTypes, FunctionType::ExtInfo(), {}, RequiredArgs::All); + return arrangeLLVMFunctionInfo(resultType, FnInfoOpts::None, argTypes, + FunctionType::ExtInfo(), {}, + RequiredArgs::All); } /// Arrange a call to a C++ method, passing the given arguments. 
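The surrounding CGCall.cpp changes replace the separate instanceMethod/chainCall booleans with a single FnInfoOpts bit mask. A self-contained sketch of that flags-enum pattern in general form, not clang's exact definitions:

    #include <cstdint>
    #include <type_traits>

    enum class CallOpts : uint8_t {
      None             = 0,
      IsInstanceMethod = 1 << 0,
      IsChainCall      = 1 << 1,
      IsDelegateCall   = 1 << 2,
    };

    constexpr CallOpts operator|(CallOpts A, CallOpts B) {
      using U = std::underlying_type_t<CallOpts>;
      return static_cast<CallOpts>(static_cast<U>(A) | static_cast<U>(B));
    }

    constexpr bool has(CallOpts Value, CallOpts Flag) {
      using U = std::underlying_type_t<CallOpts>;
      return (static_cast<U>(Value) & static_cast<U>(Flag)) != 0;
    }

    // One options parameter replaces a row of easily swapped booleans.
    void arrangeCall(CallOpts Opts) {
      bool IsInstance = has(Opts, CallOpts::IsInstanceMethod);
      bool IsChain    = has(Opts, CallOpts::IsChainCall);
      (void)IsInstance;
      (void)IsChain;
    }

    // Usage: arrangeCall(CallOpts::IsInstanceMethod | CallOpts::IsDelegateCall);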
@@ -706,15 +700,15 @@ CodeGenTypes::arrangeCXXMethodCall(const CallArgList &args, auto argTypes = getArgTypesForCall(Context, args); FunctionType::ExtInfo info = proto->getExtInfo(); - return arrangeLLVMFunctionInfo( - GetReturnType(proto->getReturnType()), /*instanceMethod=*/true, - /*chainCall=*/false, argTypes, info, paramInfos, required); + return arrangeLLVMFunctionInfo(GetReturnType(proto->getReturnType()), + FnInfoOpts::IsInstanceMethod, argTypes, info, + paramInfos, required); } const CGFunctionInfo &CodeGenTypes::arrangeNullaryFunction() { - return arrangeLLVMFunctionInfo( - getContext().VoidTy, /*instanceMethod=*/false, /*chainCall=*/false, - std::nullopt, FunctionType::ExtInfo(), {}, RequiredArgs::All); + return arrangeLLVMFunctionInfo(getContext().VoidTy, FnInfoOpts::None, + std::nullopt, FunctionType::ExtInfo(), {}, + RequiredArgs::All); } const CGFunctionInfo & @@ -734,12 +728,15 @@ CodeGenTypes::arrangeCall(const CGFunctionInfo &signature, auto argTypes = getArgTypesForCall(Context, args); assert(signature.getRequiredArgs().allowsOptionalArgs()); - return arrangeLLVMFunctionInfo(signature.getReturnType(), - signature.isInstanceMethod(), - signature.isChainCall(), - argTypes, - signature.getExtInfo(), - paramInfos, + FnInfoOpts opts = FnInfoOpts::None; + if (signature.isInstanceMethod()) + opts |= FnInfoOpts::IsInstanceMethod; + if (signature.isChainCall()) + opts |= FnInfoOpts::IsChainCall; + if (signature.isDelegateCall()) + opts |= FnInfoOpts::IsDelegateCall; + return arrangeLLVMFunctionInfo(signature.getReturnType(), opts, argTypes, + signature.getExtInfo(), paramInfos, signature.getRequiredArgs()); } @@ -752,21 +749,24 @@ void computeSPIRKernelABIInfo(CodeGenModule &CGM, CGFunctionInfo &FI); /// Arrange the argument and result information for an abstract value /// of a given function type. This is the method which all of the /// above functions ultimately defer to. -const CGFunctionInfo & -CodeGenTypes::arrangeLLVMFunctionInfo(CanQualType resultType, - bool instanceMethod, - bool chainCall, - ArrayRef argTypes, - FunctionType::ExtInfo info, - ArrayRef paramInfos, - RequiredArgs required) { +const CGFunctionInfo &CodeGenTypes::arrangeLLVMFunctionInfo( + CanQualType resultType, FnInfoOpts opts, ArrayRef argTypes, + FunctionType::ExtInfo info, + ArrayRef paramInfos, + RequiredArgs required) { assert(llvm::all_of(argTypes, [](CanQualType T) { return T.isCanonicalAsParam(); })); // Lookup or create unique function info. llvm::FoldingSetNodeID ID; - CGFunctionInfo::Profile(ID, instanceMethod, chainCall, info, paramInfos, - required, resultType, argTypes); + bool isInstanceMethod = + (opts & FnInfoOpts::IsInstanceMethod) == FnInfoOpts::IsInstanceMethod; + bool isChainCall = + (opts & FnInfoOpts::IsChainCall) == FnInfoOpts::IsChainCall; + bool isDelegateCall = + (opts & FnInfoOpts::IsDelegateCall) == FnInfoOpts::IsDelegateCall; + CGFunctionInfo::Profile(ID, isInstanceMethod, isChainCall, isDelegateCall, + info, paramInfos, required, resultType, argTypes); void *insertPos = nullptr; CGFunctionInfo *FI = FunctionInfos.FindNodeOrInsertPos(ID, insertPos); @@ -776,8 +776,8 @@ CodeGenTypes::arrangeLLVMFunctionInfo(CanQualType resultType, unsigned CC = ClangCallConvToLLVMCallConv(info.getCC()); // Construct the function info. We co-allocate the ArgInfos. 
- FI = CGFunctionInfo::create(CC, instanceMethod, chainCall, info, - paramInfos, resultType, argTypes, required); + FI = CGFunctionInfo::create(CC, isInstanceMethod, isChainCall, isDelegateCall, + info, paramInfos, resultType, argTypes, required); FunctionInfos.InsertNode(FI, insertPos); bool inserted = FunctionsBeingProcessed.insert(FI).second; @@ -812,9 +812,8 @@ CodeGenTypes::arrangeLLVMFunctionInfo(CanQualType resultType, return *FI; } -CGFunctionInfo *CGFunctionInfo::create(unsigned llvmCC, - bool instanceMethod, - bool chainCall, +CGFunctionInfo *CGFunctionInfo::create(unsigned llvmCC, bool instanceMethod, + bool chainCall, bool delegateCall, const FunctionType::ExtInfo &info, ArrayRef paramInfos, CanQualType resultType, @@ -834,6 +833,7 @@ CGFunctionInfo *CGFunctionInfo::create(unsigned llvmCC, FI->ASTCallingConvention = info.getCC(); FI->InstanceMethod = instanceMethod; FI->ChainCall = chainCall; + FI->DelegateCall = delegateCall; FI->CmseNSCall = info.getCmseNSCall(); FI->NoReturn = info.getNoReturn(); FI->ReturnsRetained = info.getProducesResult(); @@ -3989,10 +3989,6 @@ void CodeGenFunction::EmitDelegateCallArg(CallArgList &args, QualType type = param->getType(); - if (isInAllocaArgument(CGM.getCXXABI(), type)) { - CGM.ErrorUnsupported(param, "forwarded non-trivially copyable parameter"); - } - // GetAddrOfLocalVar returns a pointer-to-pointer for references, // but the argument needs to be the original pointer. if (type->isReferenceType()) { @@ -5105,7 +5101,6 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, "indirect argument must be in alloca address space"); bool NeedCopy = false; - if (Addr.getAlignment() < Align && llvm::getOrEnforceKnownAlignment(V, Align.getAsAlign(), *TD) < Align.getAsAlign()) { @@ -5244,30 +5239,50 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, dyn_cast(ArgInfo.getCoerceToType()); if (STy && ArgInfo.isDirect() && ArgInfo.getCanBeFlattened()) { llvm::Type *SrcTy = Src.getElementType(); - uint64_t SrcSize = CGM.getDataLayout().getTypeAllocSize(SrcTy); - uint64_t DstSize = CGM.getDataLayout().getTypeAllocSize(STy); - - // If the source type is smaller than the destination type of the - // coerce-to logic, copy the source value into a temp alloca the size - // of the destination type to allow loading all of it. The bits past - // the source value are left undef. 
- if (SrcSize < DstSize) { - Address TempAlloca - = CreateTempAlloca(STy, Src.getAlignment(), - Src.getName() + ".coerce"); - Builder.CreateMemCpy(TempAlloca, Src, SrcSize); - Src = TempAlloca; + llvm::TypeSize SrcTypeSize = + CGM.getDataLayout().getTypeAllocSize(SrcTy); + llvm::TypeSize DstTypeSize = CGM.getDataLayout().getTypeAllocSize(STy); + if (SrcTypeSize.isScalable()) { + assert(STy->containsHomogeneousScalableVectorTypes() && + "ABI only supports structure with homogeneous scalable vector " + "type"); + assert(SrcTypeSize == DstTypeSize && + "Only allow non-fractional movement of structure with " + "homogeneous scalable vector type"); + assert(NumIRArgs == STy->getNumElements()); + + llvm::Value *StoredStructValue = + Builder.CreateLoad(Src, Src.getName() + ".tuple"); + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + llvm::Value *Extract = Builder.CreateExtractValue( + StoredStructValue, i, Src.getName() + ".extract" + Twine(i)); + IRCallArgs[FirstIRArg + i] = Extract; + } } else { - Src = Src.withElementType(STy); - } + uint64_t SrcSize = SrcTypeSize.getFixedValue(); + uint64_t DstSize = DstTypeSize.getFixedValue(); + + // If the source type is smaller than the destination type of the + // coerce-to logic, copy the source value into a temp alloca the size + // of the destination type to allow loading all of it. The bits past + // the source value are left undef. + if (SrcSize < DstSize) { + Address TempAlloca = CreateTempAlloca(STy, Src.getAlignment(), + Src.getName() + ".coerce"); + Builder.CreateMemCpy(TempAlloca, Src, SrcSize); + Src = TempAlloca; + } else { + Src = Src.withElementType(STy); + } - assert(NumIRArgs == STy->getNumElements()); - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { - Address EltPtr = Builder.CreateStructGEP(Src, i); - llvm::Value *LI = Builder.CreateLoad(EltPtr); - if (ArgHasMaybeUndefAttr) - LI = Builder.CreateFreeze(LI); - IRCallArgs[FirstIRArg + i] = LI; + assert(NumIRArgs == STy->getNumElements()); + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + Address EltPtr = Builder.CreateStructGEP(Src, i); + llvm::Value *LI = Builder.CreateLoad(EltPtr); + if (ArgHasMaybeUndefAttr) + LI = Builder.CreateFreeze(LI); + IRCallArgs[FirstIRArg + i] = LI; + } } } else { // In the simple case, just pass the coerced loaded value. @@ -5472,6 +5487,30 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, Attrs.addFnAttribute(getLLVMContext(), llvm::Attribute::AlwaysInline); } + // The await_suspend call performed by co_await is essentially asynchronous + // to the execution of the coroutine. Inlining it normally into an unsplit + // coroutine can cause miscompilation because the coroutine CFG misrepresents + // the true control flow of the program: things that happen in the + // await_suspend are not guaranteed to happen prior to the resumption of the + // coroutine, and things that happen after the resumption of the coroutine + // (including its exit and the potential deallocation of the coroutine frame) + // are not guaranteed to happen only after the end of await_suspend. + // + // The short-term solution to this problem is to mark the call as uninlinable. + // But we don't want to do this if the call is known to be trivial, which is + // very common. + // + // The long-term solution may introduce patterns like: + // + // call @llvm.coro.await_suspend(ptr %awaiter, ptr %handle, + // ptr @awaitSuspendFn) + // + // Then it is much easier to perform the safety analysis in the middle end. 
+ // If it is safe to inline the call to awaitSuspend, we can replace it in the + // CoroEarly pass. Otherwise we could replace it in the CoroSplit pass. + if (inSuspendBlock() && mayCoroHandleEscape()) + Attrs = Attrs.addFnAttribute(getLLVMContext(), llvm::Attribute::NoInline); + // Disable inlining inside SEH __try blocks. if (isSEHTryScope()) { Attrs = Attrs.addFnAttribute(getLLVMContext(), llvm::Attribute::NoInline); @@ -5765,9 +5804,14 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, DestIsVolatile = false; } - // If the value is offset in memory, apply the offset now. - Address StorePtr = emitAddressAtOffset(*this, DestPtr, RetAI); - CreateCoercedStore(CI, StorePtr, DestIsVolatile, *this); + // An empty record can overlap other data (if declared with + // no_unique_address); omit the store for such types - as there is no + // actual data to store. + if (!isEmptyRecord(getContext(), RetTy, true)) { + // If the value is offset in memory, apply the offset now. + Address StorePtr = emitAddressAtOffset(*this, DestPtr, RetAI); + CreateCoercedStore(CI, StorePtr, DestIsVolatile, *this); + } return convertTempToRValue(DestPtr, RetTy, SourceLocation()); } diff --git a/clang/lib/CodeGen/CGCall.h b/clang/lib/CodeGen/CGCall.h index eaaf10c4eec6..65a7d8e83288 100644 --- a/clang/lib/CodeGen/CGCall.h +++ b/clang/lib/CodeGen/CGCall.h @@ -383,6 +383,35 @@ void mergeDefaultFunctionDefinitionAttributes(llvm::Function &F, const TargetOptions &TargetOpts, bool WillInternalize); +enum class FnInfoOpts { + None = 0, + IsInstanceMethod = 1 << 0, + IsChainCall = 1 << 1, + IsDelegateCall = 1 << 2, +}; + +inline FnInfoOpts operator|(FnInfoOpts A, FnInfoOpts B) { + return static_cast( + static_cast>(A) | + static_cast>(B)); +} + +inline FnInfoOpts operator&(FnInfoOpts A, FnInfoOpts B) { + return static_cast( + static_cast>(A) & + static_cast>(B)); +} + +inline FnInfoOpts operator|=(FnInfoOpts A, FnInfoOpts B) { + A = A | B; + return A; +} + +inline FnInfoOpts operator&=(FnInfoOpts A, FnInfoOpts B) { + A = A & B; + return A; +} + } // end namespace CodeGen } // end namespace clang diff --git a/clang/lib/CodeGen/CGClass.cpp b/clang/lib/CodeGen/CGClass.cpp index 93e7b54fca04..6ef7d12372d0 100644 --- a/clang/lib/CodeGen/CGClass.cpp +++ b/clang/lib/CodeGen/CGClass.cpp @@ -2927,14 +2927,16 @@ llvm::Value *CodeGenFunction::EmitVTableTypeCheckedLoad( } void CodeGenFunction::EmitForwardingCallToLambda( - const CXXMethodDecl *callOperator, - CallArgList &callArgs) { + const CXXMethodDecl *callOperator, CallArgList &callArgs, + const CGFunctionInfo *calleeFnInfo, llvm::Constant *calleePtr) { // Get the address of the call operator. - const CGFunctionInfo &calleeFnInfo = - CGM.getTypes().arrangeCXXMethodDeclaration(callOperator); - llvm::Constant *calleePtr = - CGM.GetAddrOfFunction(GlobalDecl(callOperator), - CGM.getTypes().GetFunctionType(calleeFnInfo)); + if (!calleeFnInfo) + calleeFnInfo = &CGM.getTypes().arrangeCXXMethodDeclaration(callOperator); + + if (!calleePtr) + calleePtr = + CGM.GetAddrOfFunction(GlobalDecl(callOperator), + CGM.getTypes().GetFunctionType(*calleeFnInfo)); // Prepare the return slot. 
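The CGCall.cpp hunk above stops storing a returned record when isEmptyRecord says it carries no data, and the earlier ABIInfoImpl change lets that query treat fields as if [[no_unique_address]] had emptied them. A minimal sketch of the kind of type this concerns, with illustrative names:

    struct Tag {};                       // empty record: nothing to load or store

    struct Slot {
      [[no_unique_address]] Tag t;       // allowed to share storage with `value`
      int value = 0;
    };

    // Returning the empty record carries no data either, so the coerced store
    // for such a return value can simply be omitted.
    Tag make_tag() { return Tag{}; }

    // On Itanium-ABI targets Slot is typically the size of a plain int because
    // the empty member overlaps `value`; MSVC keeps the padding byte unless the
    // [[msvc::no_unique_address]] spelling is used.
    static_assert(sizeof(Tag) >= 1, "even an empty record has nonzero size");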
const FunctionProtoType *FPT = @@ -2942,8 +2944,8 @@ void CodeGenFunction::EmitForwardingCallToLambda( QualType resultType = FPT->getReturnType(); ReturnValueSlot returnSlot; if (!resultType->isVoidType() && - calleeFnInfo.getReturnInfo().getKind() == ABIArgInfo::Indirect && - !hasScalarEvaluationKind(calleeFnInfo.getReturnType())) + calleeFnInfo->getReturnInfo().getKind() == ABIArgInfo::Indirect && + !hasScalarEvaluationKind(calleeFnInfo->getReturnType())) returnSlot = ReturnValueSlot(ReturnValue, resultType.isVolatileQualified(), /*IsUnused=*/false, /*IsExternallyDestructed=*/true); @@ -2954,7 +2956,7 @@ void CodeGenFunction::EmitForwardingCallToLambda( // Now emit our call. auto callee = CGCallee::forDirect(calleePtr, GlobalDecl(callOperator)); - RValue RV = EmitCall(calleeFnInfo, callee, returnSlot, callArgs); + RValue RV = EmitCall(*calleeFnInfo, callee, returnSlot, callArgs); // If necessary, copy the returned value into the slot. if (!resultType->isVoidType() && returnSlot.isNull()) { @@ -2996,7 +2998,15 @@ void CodeGenFunction::EmitLambdaBlockInvokeBody() { EmitForwardingCallToLambda(CallOp, CallArgs); } -void CodeGenFunction::EmitLambdaDelegatingInvokeBody(const CXXMethodDecl *MD) { +void CodeGenFunction::EmitLambdaStaticInvokeBody(const CXXMethodDecl *MD) { + if (MD->isVariadic()) { + // FIXME: Making this work correctly is nasty because it requires either + // cloning the body of the call operator or making the call operator + // forward. + CGM.ErrorUnsupported(MD, "lambda conversion to variadic function"); + return; + } + const CXXRecordDecl *Lambda = MD->getParent(); // Start building arguments for forwarding call @@ -3007,10 +3017,16 @@ void CodeGenFunction::EmitLambdaDelegatingInvokeBody(const CXXMethodDecl *MD) { Address ThisPtr = CreateMemTemp(LambdaType, "unused.capture"); CallArgs.add(RValue::get(ThisPtr.getPointer()), ThisType); - // Add the rest of the parameters. + EmitLambdaDelegatingInvokeBody(MD, CallArgs); +} + +void CodeGenFunction::EmitLambdaDelegatingInvokeBody(const CXXMethodDecl *MD, + CallArgList &CallArgs) { + // Add the rest of the forwarded parameters. for (auto *Param : MD->parameters()) EmitDelegateCallArg(CallArgs, Param, Param->getBeginLoc()); + const CXXRecordDecl *Lambda = MD->getParent(); const CXXMethodDecl *CallOp = Lambda->getLambdaCallOperator(); // For a generic lambda, find the corresponding call operator specialization // to which the call to the static-invoker shall be forwarded. @@ -3024,10 +3040,21 @@ void CodeGenFunction::EmitLambdaDelegatingInvokeBody(const CXXMethodDecl *MD) { assert(CorrespondingCallOpSpecialization); CallOp = cast(CorrespondingCallOpSpecialization); } + + // Special lambda forwarding when there are inalloca parameters. + if (hasInAllocaArg(MD)) { + const CGFunctionInfo *ImplFnInfo = nullptr; + llvm::Function *ImplFn = nullptr; + EmitLambdaInAllocaImplFn(CallOp, &ImplFnInfo, &ImplFn); + + EmitForwardingCallToLambda(CallOp, CallArgs, ImplFnInfo, ImplFn); + return; + } + EmitForwardingCallToLambda(CallOp, CallArgs); } -void CodeGenFunction::EmitLambdaStaticInvokeBody(const CXXMethodDecl *MD) { +void CodeGenFunction::EmitLambdaInAllocaCallOpBody(const CXXMethodDecl *MD) { if (MD->isVariadic()) { // FIXME: Making this work correctly is nasty because it requires either // cloning the body of the call operator or making the call operator forward. @@ -3035,5 +3062,56 @@ void CodeGenFunction::EmitLambdaStaticInvokeBody(const CXXMethodDecl *MD) { return; } - EmitLambdaDelegatingInvokeBody(MD); + // Forward %this argument. 
+ CallArgList CallArgs; + QualType LambdaType = getContext().getRecordType(MD->getParent()); + QualType ThisType = getContext().getPointerType(LambdaType); + llvm::Value *ThisArg = CurFn->getArg(0); + CallArgs.add(RValue::get(ThisArg), ThisType); + + EmitLambdaDelegatingInvokeBody(MD, CallArgs); +} + +void CodeGenFunction::EmitLambdaInAllocaImplFn( + const CXXMethodDecl *CallOp, const CGFunctionInfo **ImplFnInfo, + llvm::Function **ImplFn) { + const CGFunctionInfo &FnInfo = + CGM.getTypes().arrangeCXXMethodDeclaration(CallOp); + llvm::Function *CallOpFn = + cast(CGM.GetAddrOfFunction(GlobalDecl(CallOp))); + + // Emit function containing the original call op body. __invoke will delegate + // to this function. + SmallVector ArgTypes; + for (auto I = FnInfo.arg_begin(); I != FnInfo.arg_end(); ++I) + ArgTypes.push_back(I->type); + *ImplFnInfo = &CGM.getTypes().arrangeLLVMFunctionInfo( + FnInfo.getReturnType(), FnInfoOpts::IsDelegateCall, ArgTypes, + FnInfo.getExtInfo(), {}, FnInfo.getRequiredArgs()); + + // Create mangled name as if this was a method named __impl. If for some + // reason the name doesn't look as expected then just tack __impl to the + // front. + // TODO: Use the name mangler to produce the right name instead of using + // string replacement. + StringRef CallOpName = CallOpFn->getName(); + std::string ImplName; + if (size_t Pos = CallOpName.find_first_of("getParent()->getFunction(ImplName); + if (!Fn) { + Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(**ImplFnInfo), + llvm::GlobalValue::InternalLinkage, ImplName, + CGM.getModule()); + CGM.SetInternalFunctionAttributes(CallOp, Fn, **ImplFnInfo); + + const GlobalDecl &GD = GlobalDecl(CallOp); + const auto *D = cast(GD.getDecl()); + CodeGenFunction(CGM).GenerateCode(GD, Fn, **ImplFnInfo); + CGM.SetLLVMFunctionAttributesForDefinition(D, Fn); + } + *ImplFn = Fn; } diff --git a/clang/lib/CodeGen/CGCoroutine.cpp b/clang/lib/CodeGen/CGCoroutine.cpp index 8437cda79beb..810ae7d51ec1 100644 --- a/clang/lib/CodeGen/CGCoroutine.cpp +++ b/clang/lib/CodeGen/CGCoroutine.cpp @@ -139,6 +139,36 @@ static bool memberCallExpressionCanThrow(const Expr *E) { return true; } +/// Return true when the coroutine handle may escape from the await-suspend +/// (`awaiter.await_suspend(std::coroutine_handle)` expression). +/// Return false only when the coroutine wouldn't escape in the await-suspend +/// for sure. +/// +/// While it is always safe to return true, return falses can bring better +/// performances. +/// +/// See https://github.com/llvm/llvm-project/issues/56301 and +/// https://reviews.llvm.org/D157070 for the example and the full discussion. +/// +/// FIXME: It will be much better to perform such analysis in the middle end. +/// See the comments in `CodeGenFunction::EmitCall` for example. +static bool MayCoroHandleEscape(CoroutineSuspendExpr const &S) { + CXXRecordDecl *Awaiter = + S.getCommonExpr()->getType().getNonReferenceType()->getAsCXXRecordDecl(); + + // Return true conservatively if the awaiter type is not a record type. + if (!Awaiter) + return true; + + // In case the awaiter type is empty, the suspend wouldn't leak the coroutine + // handle. + // + // TODO: We can improve this by looking into the implementation of + // await-suspend and see if the coroutine handle is passed to foreign + // functions. 
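MayCoroHandleEscape above conservatively assumes the handle escapes whenever the awaiter has any fields, and only the field-less case keeps the await_suspend call inlinable. A C++20 sketch of the two awaiter shapes being distinguished, with illustrative names:

    #include <coroutine>

    // Field-less awaiter: await_suspend has nowhere inside the awaiter to stash
    // the handle, so the heuristic treats the handle as non-escaping.
    struct TrivialAwaiter {
      bool await_ready() const noexcept { return false; }
      void await_suspend(std::coroutine_handle<>) const noexcept {}
      void await_resume() const noexcept {}
    };

    // Stateful awaiter: await_suspend publishes the handle through a member, so
    // another thread could resume (or even destroy) the coroutine before
    // await_suspend returns; the call site is therefore marked noinline.
    struct PublishingAwaiter {
      std::coroutine_handle<> *slot = nullptr;
      bool await_ready() const noexcept { return false; }
      void await_suspend(std::coroutine_handle<> h) noexcept { *slot = h; }
      void await_resume() const noexcept {}
    };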
+ return !Awaiter->field_empty(); +} + // Emit suspend expression which roughly looks like: // // auto && x = CommonExpr(); @@ -199,8 +229,11 @@ static LValueOrRValue emitSuspendExpression(CodeGenFunction &CGF, CGCoroData &Co auto *SaveCall = Builder.CreateCall(CoroSave, {NullPtr}); CGF.CurCoro.InSuspendBlock = true; + CGF.CurCoro.MayCoroHandleEscape = MayCoroHandleEscape(S); auto *SuspendRet = CGF.EmitScalarExpr(S.getSuspendExpr()); CGF.CurCoro.InSuspendBlock = false; + CGF.CurCoro.MayCoroHandleEscape = false; + if (SuspendRet != nullptr && SuspendRet->getType()->isIntegerTy(1)) { // Veto suspension if requested by bool returning await_suspend. BasicBlock *RealSuspendBlock = diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index f049a682cfed..d8eb2aecb87a 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -391,12 +391,14 @@ llvm::DIFile *CGDebugInfo::getOrCreateFile(SourceLocation Loc) { SourceManager &SM = CGM.getContext().getSourceManager(); StringRef FileName; FileID FID; + std::optional> CSInfo; if (Loc.isInvalid()) { // The DIFile used by the CU is distinct from the main source file. Call // createFile() below for canonicalization if the source file was specified // with an absolute path. FileName = TheCU->getFile()->getFilename(); + CSInfo = TheCU->getFile()->getChecksum(); } else { PresumedLoc PLoc = SM.getPresumedLoc(Loc); FileName = PLoc.getFilename(); @@ -417,13 +419,14 @@ llvm::DIFile *CGDebugInfo::getOrCreateFile(SourceLocation Loc) { return cast(V); } + // Put Checksum at a scope where it will persist past the createFile call. SmallString<64> Checksum; - - std::optional CSKind = + if (!CSInfo) { + std::optional CSKind = computeChecksum(FID, Checksum); - std::optional> CSInfo; - if (CSKind) - CSInfo.emplace(*CSKind, Checksum); + if (CSKind) + CSInfo.emplace(*CSKind, Checksum); + } return createFile(FileName, CSInfo, getSource(SM, SM.getFileID(Loc))); } diff --git a/clang/lib/CodeGen/CGDebugInfo.h b/clang/lib/CodeGen/CGDebugInfo.h index 1fd08626358b..58ee6dd64c4f 100644 --- a/clang/lib/CodeGen/CGDebugInfo.h +++ b/clang/lib/CodeGen/CGDebugInfo.h @@ -148,7 +148,7 @@ class CGDebugInfo { llvm::BumpPtrAllocator DebugInfoNames; StringRef CWDName; - llvm::StringMap DIFileCache; + llvm::DenseMap DIFileCache; llvm::DenseMap SPCache; /// Cache declarations relevant to DW_TAG_imported_declarations (C++ /// using declarations and global alias variables) that aren't covered diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp index b0d6eb05acc2..d99dcdba8e43 100644 --- a/clang/lib/CodeGen/CGDecl.cpp +++ b/clang/lib/CodeGen/CGDecl.cpp @@ -202,7 +202,7 @@ void CodeGenFunction::EmitVarDecl(const VarDecl &D) { return; llvm::GlobalValue::LinkageTypes Linkage = - CGM.getLLVMLinkageVarDefinition(&D, /*IsConstant=*/false); + CGM.getLLVMLinkageVarDefinition(&D); // FIXME: We need to force the emission/use of a guard variable for // some variables even if we can constant-evaluate them because diff --git a/clang/lib/CodeGen/CGDeclCXX.cpp b/clang/lib/CodeGen/CGDeclCXX.cpp index be8fb6c274db..a9c88110d6f0 100644 --- a/clang/lib/CodeGen/CGDeclCXX.cpp +++ b/clang/lib/CodeGen/CGDeclCXX.cpp @@ -279,8 +279,8 @@ llvm::Function *CodeGenFunction::createTLSAtExitStub( } const CGFunctionInfo &FI = CGM.getTypes().arrangeLLVMFunctionInfo( - getContext().IntTy, /*instanceMethod=*/false, /*chainCall=*/false, - {getContext().IntTy}, FunctionType::ExtInfo(), {}, RequiredArgs::All); + getContext().IntTy, FnInfoOpts::None, 
{getContext().IntTy}, + FunctionType::ExtInfo(), {}, RequiredArgs::All); // Get the stub function type, int(*)(int,...). llvm::FunctionType *StubTy = diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index ed6095f7cfeb..fc16b3133f73 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -2692,8 +2692,7 @@ static LValue EmitGlobalNamedRegister(const VarDecl *VD, CodeGenModule &CGM) { /// this context. static bool canEmitSpuriousReferenceToVariable(CodeGenFunction &CGF, const DeclRefExpr *E, - const VarDecl *VD, - bool IsConstant) { + const VarDecl *VD) { // For a variable declared in an enclosing scope, do not emit a spurious // reference even if we have a capture, as that will emit an unwarranted // reference to our capture state, and will likely generate worse code than @@ -2726,7 +2725,7 @@ static bool canEmitSpuriousReferenceToVariable(CodeGenFunction &CGF, // We can emit a spurious reference only if the linkage implies that we'll // be emitting a non-interposable symbol that will be retained until link // time. - switch (CGF.CGM.getLLVMLinkageVarDefinition(VD, IsConstant)) { + switch (CGF.CGM.getLLVMLinkageVarDefinition(VD)) { case llvm::GlobalValue::ExternalLinkage: case llvm::GlobalValue::LinkOnceODRLinkage: case llvm::GlobalValue::WeakODRLinkage: @@ -2757,7 +2756,7 @@ LValue CodeGenFunction::EmitDeclRefLValue(const DeclRefExpr *E) { // constant value directly instead. if (E->isNonOdrUse() == NOUR_Constant && (VD->getType()->isReferenceType() || - !canEmitSpuriousReferenceToVariable(*this, E, VD, true))) { + !canEmitSpuriousReferenceToVariable(*this, E, VD))) { VD->getAnyInitializer(VD); llvm::Constant *Val = ConstantEmitter(*this).emitAbstract( E->getLocation(), *VD->evaluateValue(), VD->getType()); @@ -2859,7 +2858,7 @@ LValue CodeGenFunction::EmitDeclRefLValue(const DeclRefExpr *E) { // some reason; most likely, because it's in an outer function. 
} else if (VD->isStaticLocal()) { llvm::Constant *var = CGM.getOrCreateStaticVarDecl( - *VD, CGM.getLLVMLinkageVarDefinition(VD, /*IsConstant=*/false)); + *VD, CGM.getLLVMLinkageVarDefinition(VD)); addr = Address( var, ConvertTypeForMem(VD->getType()), getContext().getDeclAlign(VD)); diff --git a/clang/lib/CodeGen/CGExprConstant.cpp b/clang/lib/CodeGen/CGExprConstant.cpp index 353ee56839f3..942daa4aa577 100644 --- a/clang/lib/CodeGen/CGExprConstant.cpp +++ b/clang/lib/CodeGen/CGExprConstant.cpp @@ -1918,7 +1918,7 @@ ConstantLValueEmitter::tryEmitBase(const APValue::LValueBase &base) { if (VD->isLocalVarDecl()) { return CGM.getOrCreateStaticVarDecl( - *VD, CGM.getLLVMLinkageVarDefinition(VD, /*IsConstant=*/false)); + *VD, CGM.getLLVMLinkageVarDefinition(VD)); } } } diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index a52ec8909b12..124eade4617f 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -1667,7 +1667,7 @@ Address CGOpenMPRuntime::getAddrOfDeclareTargetVar(const VarDecl *VD) { auto AddrOfGlobal = [&VD, this]() { return CGM.GetAddrOfGlobal(VD); }; auto LinkageForVariable = [&VD, this]() { - return CGM.getLLVMLinkageVarDefinition(VD, /*IsConstant=*/false); + return CGM.getLLVMLinkageVarDefinition(VD); }; std::vector GeneratedRefs; @@ -10151,6 +10151,13 @@ void CGOpenMPRuntime::registerTargetGlobalVariable(const VarDecl *VD, std::optional Res = OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD); + + // If this is an 'extern' declaration we defer to the canonical definition and + // do not emit an offloading entry. + if (Res && *Res != OMPDeclareTargetDeclAttr::MT_Link && + VD->hasExternalStorage()) + return; + if (!Res) { if (CGM.getLangOpts().OpenMPIsTargetDevice) { // Register non-target variables being emitted in device code (debug info @@ -10163,7 +10170,7 @@ void CGOpenMPRuntime::registerTargetGlobalVariable(const VarDecl *VD, auto AddrOfGlobal = [&VD, this]() { return CGM.GetAddrOfGlobal(VD); }; auto LinkageForVariable = [&VD, this]() { - return CGM.getLLVMLinkageVarDefinition(VD, /*IsConstant=*/false); + return CGM.getLLVMLinkageVarDefinition(VD); }; std::vector GeneratedRefs; diff --git a/clang/lib/CodeGen/CodeGenABITypes.cpp b/clang/lib/CodeGen/CodeGenABITypes.cpp index d3a16a1d5acc..a6073e1188d6 100644 --- a/clang/lib/CodeGen/CodeGenABITypes.cpp +++ b/clang/lib/CodeGen/CodeGenABITypes.cpp @@ -65,9 +65,8 @@ CodeGen::arrangeFreeFunctionCall(CodeGenModule &CGM, ArrayRef argTypes, FunctionType::ExtInfo info, RequiredArgs args) { - return CGM.getTypes().arrangeLLVMFunctionInfo( - returnType, /*instanceMethod=*/false, /*chainCall=*/false, argTypes, - info, {}, args); + return CGM.getTypes().arrangeLLVMFunctionInfo(returnType, FnInfoOpts::None, + argTypes, info, {}, args); } ImplicitCXXConstructorArgs diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index fab70b66d1d9..7ef893cb1a2d 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -572,7 +572,7 @@ llvm::ConstantInt * CodeGenFunction::getUBSanFunctionTypeHash(QualType Ty) const { // Remove any (C++17) exception specifications, to allow calling e.g. a // noexcept function through a non-noexcept pointer. 
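The getUBSanFunctionTypeHash hunk above keeps stripping C++17 exception specifications before hashing the function type, now guarded by a direct isFunctionNoProtoType() test. A minimal sketch of the case the comment describes (illustrative only; the point is that -fsanitize=function must compute the same hash on both sides):

  // The callee is emitted with a type hash derived from its function type.
  void callee() noexcept {}

  int main() {
    void (*fp)() = callee;  // pointer to a *non*-noexcept function type
    fp();                   // the call-site hash comes from fp's type; dropping
                            // the exception specification on both sides keeps
                            // the two hashes equal
  }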
- if (!isa(Ty)) + if (!Ty->isFunctionNoProtoType()) Ty = getContext().getFunctionTypeWithExceptionSpec(Ty, EST_None); std::string Mangled; llvm::raw_string_ostream Out(Mangled); @@ -683,6 +683,19 @@ static bool matchesStlAllocatorFn(const Decl *D, const ASTContext &Ctx) { return true; } +bool CodeGenFunction::isInAllocaArgument(CGCXXABI &ABI, QualType Ty) { + const CXXRecordDecl *RD = Ty->getAsCXXRecordDecl(); + return RD && ABI.getRecordArgABI(RD) == CGCXXABI::RAA_DirectInMemory; +} + +bool CodeGenFunction::hasInAllocaArg(const CXXMethodDecl *MD) { + return getTarget().getTriple().getArch() == llvm::Triple::x86 && + getTarget().getCXXABI().isMicrosoft() && + llvm::any_of(MD->parameters(), [&](ParmVarDecl *P) { + return isInAllocaArgument(CGM.getCXXABI(), P->getType()); + }); +} + /// Return the UBSan prologue signature for \p FD if one is available. static llvm::Constant *getPrologueSignature(CodeGenModule &CGM, const FunctionDecl *FD) { @@ -1447,6 +1460,17 @@ void CodeGenFunction::GenerateCode(GlobalDecl GD, llvm::Function *Fn, // The lambda static invoker function is special, because it forwards or // clones the body of the function call operator (but is actually static). EmitLambdaStaticInvokeBody(cast(FD)); + } else if (isa(FD) && + isLambdaCallOperator(cast(FD)) && + !FnInfo.isDelegateCall() && + cast(FD)->getParent()->getLambdaStaticInvoker() && + hasInAllocaArg(cast(FD))) { + // If emitting a lambda with static invoker on X86 Windows, change + // the call operator body. + // Make sure that this is a call operator with an inalloca arg and check + // for delegate call to make sure this is the original call op and not the + // new forwarding function for the static invoker. + EmitLambdaInAllocaCallOpBody(cast(FD)); } else if (FD->isDefaulted() && isa(FD) && (cast(FD)->isCopyAssignmentOperator() || cast(FD)->isMoveAssignmentOperator())) { diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 409f48a04906..28ec2b970072 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -334,6 +334,7 @@ class CodeGenFunction : public CodeGenTypeCache { struct CGCoroInfo { std::unique_ptr Data; bool InSuspendBlock = false; + bool MayCoroHandleEscape = false; CGCoroInfo(); ~CGCoroInfo(); }; @@ -347,6 +348,10 @@ class CodeGenFunction : public CodeGenTypeCache { return isCoroutine() && CurCoro.InSuspendBlock; } + bool mayCoroHandleEscape() const { + return isCoroutine() && CurCoro.MayCoroHandleEscape; + } + /// CurGD - The GlobalDecl for the current function being compiled. GlobalDecl CurGD; @@ -1963,6 +1968,9 @@ class CodeGenFunction : public CodeGenTypeCache { /// Check if the return value of this function requires sanitization. 
bool requiresReturnValueCheck() const; + bool isInAllocaArgument(CGCXXABI &ABI, QualType Ty); + bool hasInAllocaArg(const CXXMethodDecl *MD); + llvm::BasicBlock *TerminateLandingPad = nullptr; llvm::BasicBlock *TerminateHandler = nullptr; llvm::SmallVector TrapBBs; @@ -2227,10 +2235,17 @@ class CodeGenFunction : public CodeGenTypeCache { void EmitBlockWithFallThrough(llvm::BasicBlock *BB, const Stmt *S); void EmitForwardingCallToLambda(const CXXMethodDecl *LambdaCallOperator, - CallArgList &CallArgs); + CallArgList &CallArgs, + const CGFunctionInfo *CallOpFnInfo = nullptr, + llvm::Constant *CallOpFn = nullptr); void EmitLambdaBlockInvokeBody(); - void EmitLambdaDelegatingInvokeBody(const CXXMethodDecl *MD); void EmitLambdaStaticInvokeBody(const CXXMethodDecl *MD); + void EmitLambdaDelegatingInvokeBody(const CXXMethodDecl *MD, + CallArgList &CallArgs); + void EmitLambdaInAllocaImplFn(const CXXMethodDecl *CallOp, + const CGFunctionInfo **ImplFnInfo, + llvm::Function **ImplFn); + void EmitLambdaInAllocaCallOpBody(const CXXMethodDecl *MD); void EmitLambdaVLACapture(const VariableArrayType *VAT, LValue LV) { EmitStoreThroughLValue(RValue::get(VLASizeMap[VAT->getSizeExpr()]), LV); } diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 07a9dec12f6f..a3506df7d4e5 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -1974,7 +1974,7 @@ CodeGenModule::getFunctionLinkage(GlobalDecl GD) { return llvm::GlobalValue::InternalLinkage; } - return getLLVMLinkageForDeclarator(D, Linkage, /*IsConstantVariable=*/false); + return getLLVMLinkageForDeclarator(D, Linkage); } llvm::ConstantInt *CodeGenModule::CreateCrossDsoCfiTypeId(llvm::Metadata *MD) { @@ -3605,6 +3605,13 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) { // Emit declaration of the must-be-emitted declare target variable. if (std::optional Res = OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD)) { + + // If this variable has external storage and doesn't require special + // link handling we defer to its canonical definition. + if (VD->hasExternalStorage() && + Res != OMPDeclareTargetDeclAttr::MT_Link) + return; + bool UnifiedMemoryEnabled = getOpenMPRuntime().hasRequiresUnifiedSharedMemory(); if ((*Res == OMPDeclareTargetDeclAttr::MT_To || @@ -3638,6 +3645,7 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) { if (MustBeEmitted(Global) && MayBeEmittedEagerly(Global)) { // Emit the definition if it can't be deferred. EmitGlobalDefinition(GD); + addEmittedDeferredDecl(GD); return; } @@ -3657,7 +3665,6 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) { // The value must be emitted, but cannot be emitted eagerly. assert(!MayBeEmittedEagerly(Global)); addDeferredDeclToEmit(GD); - EmittedDeferredDecls[MangledName] = GD; } else { // Otherwise, remember that we saw a deferred decl with this name. The // first use of the mangled name will cause it to move into @@ -4397,7 +4404,6 @@ llvm::Constant *CodeGenModule::GetOrCreateLLVMFunction( // DeferredDeclsToEmit list, and remove it from DeferredDecls (since we // don't need it anymore). addDeferredDeclToEmit(DDI->second); - EmittedDeferredDecls[DDI->first] = DDI->second; DeferredDecls.erase(DDI); // Otherwise, there are cases we have to worry about where we're @@ -4678,7 +4684,6 @@ CodeGenModule::GetOrCreateLLVMGlobal(StringRef MangledName, llvm::Type *Ty, // Move the potentially referenced deferred decl to the DeferredDeclsToEmit // list, and remove it from DeferredDecls (since we don't need it anymore). 
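The EmittedDeferredDecls changes in this file (continued just below) move the bookkeeping out of the lookup paths and into addEmittedDeferredDecl, which now records a declaration for re-emission only under incremental extensions and only for internal, linkonce, or weak linkage. The scenario this serves, roughly (clang-repl style partial translation units; illustrative):

  // PTU 1: an inline function gets linkonce_odr linkage when emitted.
  inline int counter() { static int n = 0; return ++n; }

  // PTU 2: a later increment references counter(); since each PTU becomes its
  // own LLVM module and discardable or internal definitions from an earlier
  // module cannot be relied on, CodeGen re-emits the definition here instead
  // of leaving an unresolved external reference.
  int x = counter();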
addDeferredDeclToEmit(DDI->second); - EmittedDeferredDecls[DDI->first] = DDI->second; DeferredDecls.erase(DDI); } @@ -5221,8 +5226,7 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D, AddGlobalAnnotations(D, GV); // Set the llvm linkage type as appropriate. - llvm::GlobalValue::LinkageTypes Linkage = - getLLVMLinkageVarDefinition(D, GV->isConstant()); + llvm::GlobalValue::LinkageTypes Linkage = getLLVMLinkageVarDefinition(D); // CUDA B.2.1 "The __device__ qualifier declares a variable that resides on // the device. [...]" @@ -5415,8 +5419,9 @@ static bool isVarDeclStrongDefinition(const ASTContext &Context, return false; } -llvm::GlobalValue::LinkageTypes CodeGenModule::getLLVMLinkageForDeclarator( - const DeclaratorDecl *D, GVALinkage Linkage, bool IsConstantVariable) { +llvm::GlobalValue::LinkageTypes +CodeGenModule::getLLVMLinkageForDeclarator(const DeclaratorDecl *D, + GVALinkage Linkage) { if (Linkage == GVA_Internal) return llvm::Function::InternalLinkage; @@ -5486,10 +5491,10 @@ llvm::GlobalValue::LinkageTypes CodeGenModule::getLLVMLinkageForDeclarator( return llvm::GlobalVariable::ExternalLinkage; } -llvm::GlobalValue::LinkageTypes CodeGenModule::getLLVMLinkageVarDefinition( - const VarDecl *VD, bool IsConstant) { +llvm::GlobalValue::LinkageTypes +CodeGenModule::getLLVMLinkageVarDefinition(const VarDecl *VD) { GVALinkage Linkage = getContext().GetGVALinkageForVariable(VD); - return getLLVMLinkageForDeclarator(VD, Linkage, IsConstant); + return getLLVMLinkageForDeclarator(VD, Linkage); } /// Replace the uses of a function that was declared with a non-proto type. @@ -5701,7 +5706,7 @@ void CodeGenModule::EmitAliasDefinition(GlobalDecl GD) { Aliasee = GetOrCreateLLVMGlobal(AA->getAliasee(), DeclTy, LangAS::Default, /*D=*/nullptr); if (const auto *VD = dyn_cast(GD.getDecl())) - LT = getLLVMLinkageVarDefinition(VD, D->getType().isConstQualified()); + LT = getLLVMLinkageVarDefinition(VD); else LT = getFunctionLinkage(GD); } @@ -6332,8 +6337,7 @@ ConstantAddress CodeGenModule::GetAddrOfGlobalTemporary( } // Create a global variable for this lifetime-extended temporary. - llvm::GlobalValue::LinkageTypes Linkage = - getLLVMLinkageVarDefinition(VD, Constant); + llvm::GlobalValue::LinkageTypes Linkage = getLLVMLinkageVarDefinition(VD); if (Linkage == llvm::GlobalVariable::ExternalLinkage) { const VarDecl *InitVD; if (VD->isStaticDataMember() && VD->getAnyInitializer(InitVD) && diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h index 05cb217e2bee..dd97808c7775 100644 --- a/clang/lib/CodeGen/CodeGenModule.h +++ b/clang/lib/CodeGen/CodeGenModule.h @@ -361,10 +361,19 @@ class CodeGenModule : public CodeGenTypeCache { llvm::DenseMap EmittedDeferredDecls; void addEmittedDeferredDecl(GlobalDecl GD) { - if (!llvm::isa(GD.getDecl())) + // Reemission is only needed in incremental mode. + if (!Context.getLangOpts().IncrementalExtensions) return; - llvm::GlobalVariable::LinkageTypes L = getFunctionLinkage(GD); - if (llvm::GlobalValue::isLinkOnceLinkage(L) || + + // Assume a linkage by default that does not need reemission. 
+ auto L = llvm::GlobalValue::ExternalLinkage; + if (llvm::isa(GD.getDecl())) + L = getFunctionLinkage(GD); + else if (auto *VD = llvm::dyn_cast(GD.getDecl())) + L = getLLVMLinkageVarDefinition(VD); + + if (llvm::GlobalValue::isInternalLinkage(L) || + llvm::GlobalValue::isLinkOnceLinkage(L) || llvm::GlobalValue::isWeakLinkage(L)) { EmittedDeferredDecls[getMangledName(GD)] = GD; } @@ -1321,12 +1330,11 @@ class CodeGenModule : public CodeGenTypeCache { /// Returns LLVM linkage for a declarator. llvm::GlobalValue::LinkageTypes - getLLVMLinkageForDeclarator(const DeclaratorDecl *D, GVALinkage Linkage, - bool IsConstantVariable); + getLLVMLinkageForDeclarator(const DeclaratorDecl *D, GVALinkage Linkage); /// Returns LLVM linkage for a declarator. llvm::GlobalValue::LinkageTypes - getLLVMLinkageVarDefinition(const VarDecl *VD, bool IsConstant); + getLLVMLinkageVarDefinition(const VarDecl *VD); /// Emit all the global annotations. void EmitGlobalAnnotations(); diff --git a/clang/lib/CodeGen/CodeGenTypes.h b/clang/lib/CodeGen/CodeGenTypes.h index 9088f77b95c3..a0e846d9a751 100644 --- a/clang/lib/CodeGen/CodeGenTypes.h +++ b/clang/lib/CodeGen/CodeGenTypes.h @@ -252,13 +252,11 @@ class CodeGenTypes { /// this. /// /// \param argTypes - must all actually be canonical as params - const CGFunctionInfo &arrangeLLVMFunctionInfo(CanQualType returnType, - bool instanceMethod, - bool chainCall, - ArrayRef argTypes, - FunctionType::ExtInfo info, - ArrayRef paramInfos, - RequiredArgs args); + const CGFunctionInfo &arrangeLLVMFunctionInfo( + CanQualType returnType, FnInfoOpts opts, ArrayRef argTypes, + FunctionType::ExtInfo info, + ArrayRef paramInfos, + RequiredArgs args); /// Compute a new LLVM record layout object for the given record. std::unique_ptr ComputeRecordLayout(const RecordDecl *D, diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp index 79a926cb9edd..ede9efb019ce 100644 --- a/clang/lib/CodeGen/ItaniumCXXABI.cpp +++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp @@ -2839,7 +2839,7 @@ static bool isThreadWrapperReplaceable(const VarDecl *VD, static llvm::GlobalValue::LinkageTypes getThreadLocalWrapperLinkage(const VarDecl *VD, CodeGen::CodeGenModule &CGM) { llvm::GlobalValue::LinkageTypes VarLinkage = - CGM.getLLVMLinkageVarDefinition(VD, /*IsConstant=*/false); + CGM.getLLVMLinkageVarDefinition(VD); // For internal linkage variables, we don't need an external or weak wrapper. if (llvm::GlobalValue::isLocalLinkage(VarLinkage)) diff --git a/clang/lib/CodeGen/MicrosoftCXXABI.cpp b/clang/lib/CodeGen/MicrosoftCXXABI.cpp index a692abaf3b75..a14efbdba76b 100644 --- a/clang/lib/CodeGen/MicrosoftCXXABI.cpp +++ b/clang/lib/CodeGen/MicrosoftCXXABI.cpp @@ -1379,8 +1379,7 @@ llvm::GlobalValue::LinkageTypes MicrosoftCXXABI::getCXXDestructorLinkage( case Dtor_Base: // The base destructor most closely tracks the user-declared constructor, so // we delegate back to the normal declarator case. - return CGM.getLLVMLinkageForDeclarator(Dtor, Linkage, - /*IsConstantVariable=*/false); + return CGM.getLLVMLinkageForDeclarator(Dtor, Linkage); case Dtor_Complete: // The complete destructor is like an inline function, but it may be // imported and therefore must be exported as well. 
This requires changing diff --git a/clang/lib/CodeGen/Targets/LoongArch.cpp b/clang/lib/CodeGen/Targets/LoongArch.cpp index 6391a8aeaa67..7483bf6d6d1e 100644 --- a/clang/lib/CodeGen/Targets/LoongArch.cpp +++ b/clang/lib/CodeGen/Targets/LoongArch.cpp @@ -148,6 +148,13 @@ bool LoongArchABIInfo::detectFARsEligibleStructHelper( if (const ConstantArrayType *ATy = getContext().getAsConstantArrayType(Ty)) { uint64_t ArraySize = ATy->getSize().getZExtValue(); QualType EltTy = ATy->getElementType(); + // Non-zero-length arrays of empty records make the struct ineligible to be + // passed via FARs in C++. + if (const auto *RTy = EltTy->getAs()) { + if (ArraySize != 0 && isa(RTy->getDecl()) && + isEmptyRecord(getContext(), EltTy, true, true)) + return false; + } CharUnits EltSize = getContext().getTypeSizeInChars(EltTy); for (uint64_t i = 0; i < ArraySize; ++i) { if (!detectFARsEligibleStructHelper(EltTy, CurOff, Field1Ty, Field1Off, @@ -163,7 +170,7 @@ bool LoongArchABIInfo::detectFARsEligibleStructHelper( // copy constructor are not eligible for the FP calling convention. if (getRecordArgABI(Ty, CGT.getCXXABI())) return false; - if (isEmptyRecord(getContext(), Ty, true)) + if (isEmptyRecord(getContext(), Ty, true, true)) return true; const RecordDecl *RD = RTy->getDecl(); // Unions aren't eligible unless they're empty (which is caught above). @@ -222,6 +229,8 @@ bool LoongArchABIInfo::detectFARsEligibleStruct( if (!detectFARsEligibleStructHelper(Ty, CharUnits::Zero(), Field1Ty, Field1Off, Field2Ty, Field2Off)) return false; + if (!Field1Ty) + return false; // Not really a candidate if we have a single int but no float. if (Field1Ty && !Field2Ty && !Field1Ty->isFloatingPointTy()) return false; diff --git a/clang/lib/CodeGen/Targets/RISCV.cpp b/clang/lib/CodeGen/Targets/RISCV.cpp index b6d8ae462675..b12c3025f607 100644 --- a/clang/lib/CodeGen/Targets/RISCV.cpp +++ b/clang/lib/CodeGen/Targets/RISCV.cpp @@ -8,7 +8,6 @@ #include "ABIInfoImpl.h" #include "TargetInfo.h" -#include "llvm/TargetParser/RISCVTargetParser.h" using namespace clang; using namespace clang::CodeGen; @@ -152,6 +151,13 @@ bool RISCVABIInfo::detectFPCCEligibleStructHelper(QualType Ty, CharUnits CurOff, if (const ConstantArrayType *ATy = getContext().getAsConstantArrayType(Ty)) { uint64_t ArraySize = ATy->getSize().getZExtValue(); QualType EltTy = ATy->getElementType(); + // Non-zero-length arrays of empty records make the struct ineligible for + // the FP calling convention in C++. + if (const auto *RTy = EltTy->getAs()) { + if (ArraySize != 0 && isa(RTy->getDecl()) && + isEmptyRecord(getContext(), EltTy, true, true)) + return false; + } CharUnits EltSize = getContext().getTypeSizeInChars(EltTy); for (uint64_t i = 0; i < ArraySize; ++i) { bool Ret = detectFPCCEligibleStructHelper(EltTy, CurOff, Field1Ty, @@ -168,7 +174,7 @@ bool RISCVABIInfo::detectFPCCEligibleStructHelper(QualType Ty, CharUnits CurOff, // copy constructor are not eligible for the FP calling convention. if (getRecordArgABI(Ty, CGT.getCXXABI())) return false; - if (isEmptyRecord(getContext(), Ty, true)) + if (isEmptyRecord(getContext(), Ty, true, true)) return true; const RecordDecl *RD = RTy->getDecl(); // Unions aren't eligible unless they're empty (which is caught above). 
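The LoongArch and RISC-V hunks above add the same guard: in C++ an array of empty records still occupies storage, so a struct containing one can no longer be treated as "at most two floating-point/integer members" for the FP-register calling convention. Roughly the shape the new check rejects (illustrative):

  struct Empty {};
  struct S {
    Empty pad[4];   // in C++ each element is one byte, so this is real storage
    double d;
  };
  // With the new check, S is classified up front as ineligible for the
  // two-register FP convention and falls back to the normal argument rules.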
@@ -238,6 +244,8 @@ bool RISCVABIInfo::detectFPCCEligibleStruct(QualType Ty, llvm::Type *&Field1Ty, NeededArgFPRs = 0; bool IsCandidate = detectFPCCEligibleStructHelper( Ty, CharUnits::Zero(), Field1Ty, Field1Off, Field2Ty, Field2Off); + if (!Field1Ty) + return false; // Not really a candidate if we have a single int but no float. if (Field1Ty && !Field2Ty && !Field1Ty->isFloatingPointTy()) return false; @@ -315,11 +323,15 @@ ABIArgInfo RISCVABIInfo::coerceVLSVector(QualType Ty) const { assert(VT->getElementType()->isBuiltinType() && "expected builtin type!"); - const auto *BT = VT->getElementType()->castAs(); - unsigned EltSize = getContext().getTypeSize(BT); + auto VScale = + getContext().getTargetInfo().getVScaleRange(getContext().getLangOpts()); + // The MinNumElts is simplified from equation: + // NumElts / VScale = + // (EltSize * NumElts / (VScale * RVVBitsPerBlock)) + // * (RVVBitsPerBlock / EltSize) llvm::ScalableVectorType *ResType = - llvm::ScalableVectorType::get(CGT.ConvertType(VT->getElementType()), - llvm::RISCV::RVVBitsPerBlock / EltSize); + llvm::ScalableVectorType::get(CGT.ConvertType(VT->getElementType()), + VT->getNumElements() / VScale->first); return ABIArgInfo::getDirect(ResType); } diff --git a/clang/lib/CodeGen/Targets/X86.cpp b/clang/lib/CodeGen/Targets/X86.cpp index 31679d899a44..9f5c3258d65c 100644 --- a/clang/lib/CodeGen/Targets/X86.cpp +++ b/clang/lib/CodeGen/Targets/X86.cpp @@ -140,7 +140,8 @@ class X86_32ABIInfo : public ABIInfo { Class classify(QualType Ty) const; ABIArgInfo classifyReturnType(QualType RetTy, CCState &State) const; - ABIArgInfo classifyArgumentType(QualType RetTy, CCState &State) const; + ABIArgInfo classifyArgumentType(QualType RetTy, CCState &State, + bool isDelegateCall) const; /// Updates the number of available free registers, returns /// true if any registers were allocated. @@ -737,8 +738,8 @@ void X86_32ABIInfo::runVectorCallFirstPass(CGFunctionInfo &FI, CCState &State) c } } -ABIArgInfo X86_32ABIInfo::classifyArgumentType(QualType Ty, - CCState &State) const { +ABIArgInfo X86_32ABIInfo::classifyArgumentType(QualType Ty, CCState &State, + bool isDelegateCall) const { // FIXME: Set alignment on indirect arguments. bool IsFastCall = State.CC == llvm::CallingConv::X86_FastCall; bool IsRegCall = State.CC == llvm::CallingConv::X86_RegCall; @@ -753,6 +754,12 @@ ABIArgInfo X86_32ABIInfo::classifyArgumentType(QualType Ty, CGCXXABI::RecordArgABI RAA = getRecordArgABI(RT, getCXXABI()); if (RAA == CGCXXABI::RAA_Indirect) { return getIndirectResult(Ty, false, State); + } else if (isDelegateCall) { + // Avoid having different alignments on delegate call args by always + // setting the alignment to 4, which is what we do for inallocas. + ABIArgInfo Res = getIndirectResult(Ty, false, State); + Res.setIndirectAlign(CharUnits::fromQuantity(4)); + return Res; } else if (RAA == CGCXXABI::RAA_DirectInMemory) { // The field index doesn't matter, we'll fix it up later. 
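As a quick check of the MinNumElts simplification in the coerceVLSVector hunk above (numbers are illustrative): take a fixed-length vector of 8 x i32 built with -mrvv-vector-bits=256, so with RVVBitsPerBlock = 64 the vscale range is exactly 4. Then

  NumElts / VScale = 8 / 4 = 2

  (EltSize * NumElts / (VScale * RVVBitsPerBlock)) * (RVVBitsPerBlock / EltSize)
      = (32 * 8 / (4 * 64)) * (64 / 32)
      = 1 * 2
      = 2

Both forms give <vscale x 2 x i32>, which holds the original 8 elements when vscale is 4.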
return ABIArgInfo::getInAlloca(/*FieldIndex=*/0); @@ -940,7 +947,8 @@ void X86_32ABIInfo::computeInfo(CGFunctionInfo &FI) const { if (State.IsPreassigned.test(I)) continue; - Args[I].info = classifyArgumentType(Args[I].type, State); + Args[I].info = + classifyArgumentType(Args[I].type, State, FI.isDelegateCall()); UsedInAlloca |= (Args[I].info.getKind() == ABIArgInfo::InAlloca); } diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 488350169efa..f6ea4d0b4366 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -4741,13 +4741,8 @@ Action *Driver::ConstructPhaseAction( } case phases::Backend: { if (isUsingLTO() && TargetDeviceOffloadKind == Action::OFK_None) { - types::ID Output; - if (Args.hasArg(options::OPT_S)) - Output = types::TY_LTO_IR; - else if (Args.hasArg(options::OPT_ffat_lto_objects)) - Output = types::TY_PP_Asm; - else - Output = types::TY_LTO_BC; + types::ID Output = + Args.hasArg(options::OPT_S) ? types::TY_LTO_IR : types::TY_LTO_BC; return C.MakeAction(Input, Output); } if (isUsingLTO(/* IsOffload */ true) && diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp index c3ce13f93464..12fe55be9113 100644 --- a/clang/lib/Driver/SanitizerArgs.cpp +++ b/clang/lib/Driver/SanitizerArgs.cpp @@ -37,6 +37,8 @@ static const SanitizerMask NeedsUbsanCxxRt = SanitizerKind::Vptr | SanitizerKind::CFI; static const SanitizerMask NotAllowedWithTrap = SanitizerKind::Vptr; static const SanitizerMask NotAllowedWithMinimalRuntime = SanitizerKind::Vptr; +static const SanitizerMask NotAllowedWithExecuteOnly = + SanitizerKind::Function | SanitizerKind::KCFI; static const SanitizerMask RequiresPIE = SanitizerKind::DataFlow | SanitizerKind::Scudo; static const SanitizerMask NeedsUnwindTables = @@ -141,6 +143,16 @@ static std::string describeSanitizeArg(const llvm::opt::Arg *A, /// Sanitizers set. static std::string toString(const clang::SanitizerSet &Sanitizers); +/// Return true if an execute-only target disallows data access to code +/// sections. +static bool isExecuteOnlyTarget(const llvm::Triple &Triple, + const llvm::opt::ArgList &Args) { + if (Triple.isPS5()) + return true; + return Args.hasFlagNoClaim(options::OPT_mexecute_only, + options::OPT_mno_execute_only, false); +} + static void validateSpecialCaseListFormat(const Driver &D, std::vector &SCLFiles, unsigned MalformedSCLErrorDiagID, @@ -395,6 +407,22 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC, DiagnosedKinds |= SanitizerKind::Function; } } + // -fsanitize=function and -fsanitize=kcfi instrument indirect function + // calls to load a type hash before the function label. Therefore, an + // execute-only target doesn't support the function and kcfi sanitizers. + const llvm::Triple &Triple = TC.getTriple(); + if (isExecuteOnlyTarget(Triple, Args)) { + if (SanitizerMask KindsToDiagnose = + Add & NotAllowedWithExecuteOnly & ~DiagnosedKinds) { + if (DiagnoseErrors) { + std::string Desc = describeSanitizeArg(Arg, KindsToDiagnose); + D.Diag(diag::err_drv_argument_not_allowed_with) + << Desc << Triple.str(); + } + DiagnosedKinds |= KindsToDiagnose; + } + Add &= ~NotAllowedWithExecuteOnly; + } // FIXME: Make CFI on member function calls compatible with cross-DSO CFI. // There are currently two problems: @@ -457,6 +485,10 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC, if (MinimalRuntime) { Add &= ~NotAllowedWithMinimalRuntime; } + // NotAllowedWithExecuteOnly is silently discarded on an execute-only + // target if implicitly enabled through group expansion. 
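The SanitizerArgs changes above reject -fsanitize=function and -fsanitize=kcfi on execute-only targets because both schemes place a type hash in front of the function entry, and verifying it at an indirect call means loading data from the code section. A rough pseudo-C picture of the caller-side check (offsets and names are illustrative, not the exact emitted sequence):

  typedef void fn_t(void);
  void call_checked(fn_t *fp) {
    unsigned expected = 0x89ab01u;                 // hash of the expected type
    unsigned found = *((const unsigned *)fp - 1);  // a data read from .text,
                                                   // which is exactly what
                                                   // execute-only memory forbids
    if (found != expected)
      __builtin_trap();
    fp();
  }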
+ if (isExecuteOnlyTarget(Triple, Args)) + Add &= ~NotAllowedWithExecuteOnly; if (CfiCrossDso) Add &= ~SanitizerKind::CFIMFCall; Add &= Supported; diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index d60fdbc17968..8dafc3d481c2 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -427,6 +427,12 @@ ToolChain::getDefaultUnwindTableLevel(const ArgList &Args) const { return UnwindTableLevel::None; } +unsigned ToolChain::GetDefaultDwarfVersion() const { + // TODO: Remove the RISC-V special case when R_RISCV_SET_ULEB128 linker + // support becomes more widely available. + return getTriple().isRISCV() ? 4 : 5; +} + Tool *ToolChain::getClang() const { if (!Clang) Clang.reset(new tools::Clang(*this, useIntegratedBackend())); diff --git a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp index 6cbb06b9a91f..65925e9ed610 100644 --- a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp +++ b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp @@ -127,31 +127,11 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D, const llvm::Triple &Triple, const ArgList &Args, std::vector &Features) { - StringRef ArchName; - if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) { + std::string ArchName; + if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) ArchName = A->getValue(); - - // Handle -march=native. - if (ArchName == "native") { - ArchName = llvm::sys::getHostCPUName(); - if (ArchName == "generic") - ArchName = llvm::LoongArch::getDefaultArch(Triple.isLoongArch64()); - } - - if (!llvm::LoongArch::isValidArchName(ArchName)) { - D.Diag(clang::diag::err_drv_invalid_arch_name) << A->getAsString(Args); - return; - } - } - - // Select a default arch name. - if (ArchName.empty()) - ArchName = llvm::LoongArch::getDefaultArch(Triple.isLoongArch64()); - - if (!ArchName.empty()) { - llvm::LoongArch::getArchFeatures(ArchName, Features); - llvm::LoongArch::setArch(ArchName); - } + ArchName = postProcessTargetCPUString(ArchName, Triple); + llvm::LoongArch::getArchFeatures(ArchName, Features); // Select floating-point features determined by -mdouble-float, // -msingle-float, -msoft-float and -mfpu. @@ -196,3 +176,25 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D, if (Arg *A = Args.getLastArgNoClaim(options::OPT_mfpu_EQ)) A->ignoreTargetSpecific(); } + +std::string loongarch::postProcessTargetCPUString(const std::string &CPU, + const llvm::Triple &Triple) { + std::string CPUString = CPU; + if (CPUString == "native") { + CPUString = llvm::sys::getHostCPUName(); + if (CPUString == "generic") + CPUString = llvm::LoongArch::getDefaultArch(Triple.isLoongArch64()); + } + if (CPUString.empty()) + CPUString = llvm::LoongArch::getDefaultArch(Triple.isLoongArch64()); + return CPUString; +} + +std::string loongarch::getLoongArchTargetCPU(const llvm::opt::ArgList &Args, + const llvm::Triple &Triple) { + std::string CPU; + // If we have -march, use that. 
+ if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) + CPU = A->getValue(); + return postProcessTargetCPUString(CPU, Triple); +} diff --git a/clang/lib/Driver/ToolChains/Arch/LoongArch.h b/clang/lib/Driver/ToolChains/Arch/LoongArch.h index 0084474e7ed3..d8280cd836f8 100644 --- a/clang/lib/Driver/ToolChains/Arch/LoongArch.h +++ b/clang/lib/Driver/ToolChains/Arch/LoongArch.h @@ -23,6 +23,12 @@ void getLoongArchTargetFeatures(const Driver &D, const llvm::Triple &Triple, StringRef getLoongArchABI(const Driver &D, const llvm::opt::ArgList &Args, const llvm::Triple &Triple); + +std::string postProcessTargetCPUString(const std::string &CPU, + const llvm::Triple &Triple); + +std::string getLoongArchTargetCPU(const llvm::opt::ArgList &Args, + const llvm::Triple &Triple); } // end namespace loongarch } // end namespace tools } // end namespace driver diff --git a/clang/lib/Driver/ToolChains/Arch/X86.cpp b/clang/lib/Driver/ToolChains/Arch/X86.cpp index 286bac2e7a2b..4383b8004143 100644 --- a/clang/lib/Driver/ToolChains/Arch/X86.cpp +++ b/clang/lib/Driver/ToolChains/Arch/X86.cpp @@ -118,7 +118,13 @@ std::string x86::getX86TargetCPU(const Driver &D, const ArgList &Args, void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, const ArgList &Args, - std::vector &Features) { + std::vector &Features, bool ForAS) { + if (ForAS) { + // Some target-specific options are only handled in AddX86TargetArgs, which + // is not called by ClangAs::ConstructJob. Claim them here. + Args.claimAllArgs(options::OPT_mfpmath_EQ); + } + // Claim and report unsupported -mabi=. Note: we don't support "sysv_abi" or // "ms_abi" as default function attributes. if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mabi_EQ)) { @@ -267,4 +273,10 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, << A->getSpelling() << Scope; } } + + // -mno-gather, -mno-scatter support + if (Args.hasArg(options::OPT_mno_gather)) + Features.push_back("+prefer-no-gather"); + if (Args.hasArg(options::OPT_mno_scatter)) + Features.push_back("+prefer-no-scatter"); } diff --git a/clang/lib/Driver/ToolChains/Arch/X86.h b/clang/lib/Driver/ToolChains/Arch/X86.h index e07387f3ece3..762a1fa6f4d5 100644 --- a/clang/lib/Driver/ToolChains/Arch/X86.h +++ b/clang/lib/Driver/ToolChains/Arch/X86.h @@ -26,7 +26,7 @@ std::string getX86TargetCPU(const Driver &D, const llvm::opt::ArgList &Args, void getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, const llvm::opt::ArgList &Args, - std::vector &Features); + std::vector &Features, bool ForAS); } // end namespace x86 } // end namespace target diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index e3fa315ffcb1..47ec36b3a8ff 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -1863,15 +1863,10 @@ void Clang::AddLoongArchTargetArgs(const ArgList &Args, // Handle -mtune. 
if (const Arg *A = Args.getLastArg(options::OPT_mtune_EQ)) { - StringRef TuneCPU = A->getValue(); - if (TuneCPU == "native") { - TuneCPU = llvm::sys::getHostCPUName(); - if (TuneCPU == "generic") - TuneCPU = llvm::LoongArch::getDefaultArch(Triple.isLoongArch64()); - } + std::string TuneCPU = A->getValue(); + TuneCPU = loongarch::postProcessTargetCPUString(TuneCPU, Triple); CmdArgs.push_back("-tune-cpu"); CmdArgs.push_back(Args.MakeArgString(TuneCPU)); - llvm::LoongArch::setTuneCPU(TuneCPU); } } @@ -2068,6 +2063,12 @@ void Clang::AddPPCTargetArgs(const ArgList &Args, } else if (V == "vec-extabi") { VecExtabi = true; A->claim(); + } else if (V == "elfv1") { + ABIName = "elfv1"; + A->claim(); + } else if (V == "elfv2") { + ABIName = "elfv2"; + A->claim(); } else if (V != "altivec") // The ppc64 linux abis are all "altivec" abis by default. Accept and ignore // the option if given as we don't have backend support for any targets @@ -7375,22 +7376,6 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, if (SplitLTOUnit) CmdArgs.push_back("-fsplit-lto-unit"); - if (Arg *A = Args.getLastArg(options::OPT_ffat_lto_objects, - options::OPT_fno_fat_lto_objects)) { - if (IsUsingLTO && A->getOption().matches(options::OPT_ffat_lto_objects)) { - assert(LTOMode == LTOK_Full || LTOMode == LTOK_Thin); - if (!Triple.isOSBinFormatELF()) { - D.Diag(diag::err_drv_unsupported_opt_for_target) - << A->getAsString(Args) << TC.getTripleString(); - } - CmdArgs.push_back(Args.MakeArgString( - Twine("-flto=") + (LTOMode == LTOK_Thin ? "thin" : "full"))); - CmdArgs.push_back("-flto-unit"); - CmdArgs.push_back("-ffat-lto-objects"); - A->render(Args, CmdArgs); - } - } - if (Arg *A = Args.getLastArg(options::OPT_fglobal_isel, options::OPT_fno_global_isel)) { CmdArgs.push_back("-mllvm"); diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 358d7565f47c..8766d34eec53 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -474,6 +474,10 @@ std::string tools::getCPUName(const Driver &D, const ArgList &Args, case llvm::Triple::wasm32: case llvm::Triple::wasm64: return std::string(getWebAssemblyTargetCPU(Args)); + + case llvm::Triple::loongarch32: + case llvm::Triple::loongarch64: + return loongarch::getLoongArchTargetCPU(Args, T); } } @@ -524,7 +528,7 @@ void tools::getTargetFeatures(const Driver &D, const llvm::Triple &Triple, break; case llvm::Triple::x86: case llvm::Triple::x86_64: - x86::getX86TargetFeatures(D, Triple, Args, Features); + x86::getX86TargetFeatures(D, Triple, Args, Features, ForAS); break; case llvm::Triple::hexagon: hexagon::getHexagonTargetFeatures(D, Triple, Args, Features); @@ -617,11 +621,6 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args, PluginName + Suffix, Plugin); CmdArgs.push_back(Args.MakeArgString(Twine(PluginPrefix) + Plugin)); - } else { - // Tell LLD to find and use .llvm.lto section in regular relocatable object - // files - if (Args.hasArg(options::OPT_ffat_lto_objects)) - CmdArgs.push_back("--fat-lto-objects"); } const char *PluginOptPrefix = IsOSAIX ? 
"-bplugin_opt:" : "-plugin-opt="; diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index b64fff8b14be..40038dce47d8 100644 --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -1874,6 +1874,12 @@ static bool findBiarchMultilibs(const Driver &D, .flag("-m64", /*Disallow=*/true) .flag("-mx32") .makeMultilib(); + Multilib Alt32sparc = MultilibBuilder() + .gccSuffix("/sparcv8plus") + .includeSuffix("/sparcv8plus") + .flag("-m32") + .flag("-m64", /*Disallow=*/true) + .makeMultilib(); // GCC toolchain for IAMCU doesn't have crtbegin.o, so look for libgcc.a. FilterNonExistent NonExistent( @@ -1885,10 +1891,14 @@ static bool findBiarchMultilibs(const Driver &D, const bool IsX32 = TargetTriple.isX32(); if (TargetTriple.isArch32Bit() && !NonExistent(Alt32)) Want = WANT64; + if (TargetTriple.isArch32Bit() && !NonExistent(Alt32sparc)) + Want = WANT64; else if (TargetTriple.isArch64Bit() && IsX32 && !NonExistent(Altx32)) Want = WANT64; else if (TargetTriple.isArch64Bit() && !IsX32 && !NonExistent(Alt64)) Want = WANT32; + else if (TargetTriple.isArch64Bit() && !NonExistent(Alt32sparc)) + Want = WANT64; else { if (TargetTriple.isArch32Bit()) Want = NeedsBiarchSuffix ? WANT64 : WANT32; @@ -1919,6 +1929,7 @@ static bool findBiarchMultilibs(const Driver &D, Result.Multilibs.push_back(Alt64); Result.Multilibs.push_back(Alt32); Result.Multilibs.push_back(Altx32); + Result.Multilibs.push_back(Alt32sparc); Result.Multilibs.FilterOut(NonExistent); @@ -1932,7 +1943,8 @@ static bool findBiarchMultilibs(const Driver &D, if (Result.SelectedMultilibs.back() == Alt64 || Result.SelectedMultilibs.back() == Alt32 || - Result.SelectedMultilibs.back() == Altx32) + Result.SelectedMultilibs.back() == Altx32 || + Result.SelectedMultilibs.back() == Alt32sparc) Result.BiarchSibling = Default; return true; @@ -2215,6 +2227,7 @@ void Generic_GCC::GCCInstallationDetector::AddDefaultGCCPrefixes( // so we need to find those /usr/gcc/*/lib/gcc libdirs and go with // /usr/gcc/ as a prefix. + SmallVector, 8> SolarisPrefixes; std::string PrefixDir = concat(SysRoot, "/usr/gcc"); std::error_code EC; for (llvm::vfs::directory_iterator LI = D.getVFS().dir_begin(PrefixDir, EC), @@ -2232,8 +2245,13 @@ void Generic_GCC::GCCInstallationDetector::AddDefaultGCCPrefixes( if (!D.getVFS().exists(CandidateLibPath)) continue; - Prefixes.push_back(CandidatePrefix); + SolarisPrefixes.emplace_back( + std::make_pair(CandidateVersion, CandidatePrefix)); } + // Sort in reverse order so GCCInstallationDetector::init picks the latest. 
+ std::sort(SolarisPrefixes.rbegin(), SolarisPrefixes.rend()); + for (auto p : SolarisPrefixes) + Prefixes.emplace_back(p.second); return; } diff --git a/clang/lib/Driver/ToolChains/Hexagon.cpp b/clang/lib/Driver/ToolChains/Hexagon.cpp index 7acc600a6aa4..aed4ab1955b4 100644 --- a/clang/lib/Driver/ToolChains/Hexagon.cpp +++ b/clang/lib/Driver/ToolChains/Hexagon.cpp @@ -383,6 +383,10 @@ constructHexagonLinkArgs(Compilation &C, const JobAction &JA, if (HTC.ShouldLinkCXXStdlib(Args)) HTC.AddCXXStdlibLibArgs(Args, CmdArgs); } + const ToolChain::path_list &LibPaths = HTC.getFilePaths(); + for (const auto &LibPath : LibPaths) + CmdArgs.push_back(Args.MakeArgString(StringRef("-L") + LibPath)); + Args.ClaimAllArgs(options::OPT_L); return; } @@ -441,6 +445,7 @@ constructHexagonLinkArgs(Compilation &C, const JobAction &JA, const ToolChain::path_list &LibPaths = HTC.getFilePaths(); for (const auto &LibPath : LibPaths) CmdArgs.push_back(Args.MakeArgString(StringRef("-L") + LibPath)); + Args.ClaimAllArgs(options::OPT_L); //---------------------------------------------------------------------------- // diff --git a/clang/lib/Driver/ToolChains/Solaris.cpp b/clang/lib/Driver/ToolChains/Solaris.cpp index 335a5a88cdfa..de5a69e4ca3f 100644 --- a/clang/lib/Driver/ToolChains/Solaris.cpp +++ b/clang/lib/Driver/ToolChains/Solaris.cpp @@ -47,11 +47,24 @@ void solaris::Assembler::ConstructJob(Compilation &C, const JobAction &JA, Exec, CmdArgs, Inputs, Output)); } +static bool getPIE(const ArgList &Args, const ToolChain &TC) { + if (Args.hasArg(options::OPT_shared) || Args.hasArg(options::OPT_static) || + Args.hasArg(options::OPT_r)) + return false; + + Arg *A = Args.getLastArg(options::OPT_pie, options::OPT_no_pie, + options::OPT_nopie); + if (!A) + return TC.isPIEDefault(Args); + return A->getOption().matches(options::OPT_pie); +} + void solaris::Linker::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { + const bool IsPIE = getPIE(Args, getToolChain()); ArgStringList CmdArgs; // Demangle C++ names in errors @@ -62,6 +75,11 @@ void solaris::Linker::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("_start"); } + if (IsPIE) { + CmdArgs.push_back("-z"); + CmdArgs.push_back("type=pie"); + } + if (Args.hasArg(options::OPT_static)) { CmdArgs.push_back("-Bstatic"); CmdArgs.push_back("-dn"); @@ -113,8 +131,13 @@ void solaris::Linker::ConstructJob(Compilation &C, const JobAction &JA, values_xpg = "values-xpg4.o"; CmdArgs.push_back( Args.MakeArgString(getToolChain().GetFilePath(values_xpg))); - CmdArgs.push_back( - Args.MakeArgString(getToolChain().GetFilePath("crtbegin.o"))); + + const char *crtbegin = nullptr; + if (Args.hasArg(options::OPT_shared) || IsPIE) + crtbegin = "crtbeginS.o"; + else + crtbegin = "crtbegin.o"; + CmdArgs.push_back(Args.MakeArgString(getToolChain().GetFilePath(crtbegin))); // Add crtfastmath.o if available and fast math is enabled. getToolChain().addFastMathRuntimeIfAvailable(Args, CmdArgs); } @@ -151,24 +174,32 @@ void solaris::Linker::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-lgcc"); CmdArgs.push_back("-lm"); } + const SanitizerArgs &SA = getToolChain().getSanitizerArgs(Args); if (NeedsSanitizerDeps) { linkSanitizerRuntimeDeps(getToolChain(), CmdArgs); // Work around Solaris/amd64 ld bug when calling __tls_get_addr directly. // However, ld -z relax=transtls is available since Solaris 11.2, but not // in Illumos. 
- const SanitizerArgs &SA = getToolChain().getSanitizerArgs(Args); if (getToolChain().getTriple().getArch() == llvm::Triple::x86_64 && (SA.needsAsanRt() || SA.needsStatsRt() || (SA.needsUbsanRt() && !SA.requiresMinimalRuntime()))) CmdArgs.push_back("-zrelax=transtls"); } + // Avoid AsanInitInternal cycle, Issue #64126. + if (getToolChain().getTriple().isX86() && SA.needsSharedRt() && + SA.needsAsanRt()) + CmdArgs.push_back("-znow"); } if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles, options::OPT_r)) { - CmdArgs.push_back( - Args.MakeArgString(getToolChain().GetFilePath("crtend.o"))); + if (Args.hasArg(options::OPT_shared) || IsPIE) + CmdArgs.push_back( + Args.MakeArgString(getToolChain().GetFilePath("crtendS.o"))); + else + CmdArgs.push_back( + Args.MakeArgString(getToolChain().GetFilePath("crtend.o"))); CmdArgs.push_back( Args.MakeArgString(getToolChain().GetFilePath("crtn.o"))); } diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 32619bc56f7a..852437b9390f 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -581,7 +581,8 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) { ProbablyBracedList = ProbablyBracedList || (NextTok->is(tok::l_brace) && LBraceStack.back().PrevTok && - LBraceStack.back().PrevTok->is(tok::identifier)); + LBraceStack.back().PrevTok->isOneOf(tok::identifier, + tok::greater)); ProbablyBracedList = ProbablyBracedList || @@ -2464,7 +2465,7 @@ bool UnwrappedLineParser::parseParens(TokenType AmpAmpTokenType) { const auto *PrevPrev = Prev ? Prev->getPreviousNonComment() : nullptr; const bool Blacklisted = PrevPrev && - (PrevPrev->is(tok::kw___attribute) || + (PrevPrev->isOneOf(tok::kw___attribute, tok::kw_decltype) || (SeenEqual && (PrevPrev->isOneOf(tok::kw_if, tok::kw_while) || PrevPrev->endsSequence(tok::kw_constexpr, tok::kw_if)))); diff --git a/clang/lib/Headers/__clang_cuda_math.h b/clang/lib/Headers/__clang_cuda_math.h index e447590393ec..6166317f8f9d 100644 --- a/clang/lib/Headers/__clang_cuda_math.h +++ b/clang/lib/Headers/__clang_cuda_math.h @@ -36,7 +36,7 @@ // because the OpenMP overlay requires constexpr functions here but prior to // c++14 void return functions could not be constexpr. 
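The __clang_cuda_math.h hunk just below fixes a malformed preprocessor condition: #ifdef takes a single macro name, so everything after __OPENMP_NVPTX__ was never evaluated (and typically draws an "extra tokens" warning). A minimal illustration of the difference:

  #ifdef FOO && defined(BAR)        // tests only FOO; the rest is ignored
  #endif

  #if defined(FOO) && defined(BAR)  // evaluates the whole condition
  #endif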
#pragma push_macro("__DEVICE_VOID__") -#ifdef __OPENMP_NVPTX__ && defined(__cplusplus) && __cplusplus < 201402L +#if defined(__OPENMP_NVPTX__) && defined(__cplusplus) && __cplusplus < 201402L #define __DEVICE_VOID__ static __attribute__((always_inline, nothrow)) #else #define __DEVICE_VOID__ __DEVICE__ diff --git a/clang/lib/Headers/__clang_hip_libdevice_declares.h b/clang/lib/Headers/__clang_hip_libdevice_declares.h index ed576027cb5e..f15198b3d9f9 100644 --- a/clang/lib/Headers/__clang_hip_libdevice_declares.h +++ b/clang/lib/Headers/__clang_hip_libdevice_declares.h @@ -317,7 +317,7 @@ __device__ __attribute__((pure)) __2f16 __ocml_log_2f16(__2f16); __device__ __attribute__((pure)) __2f16 __ocml_log10_2f16(__2f16); __device__ __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16); -#if HIP_VERSION_MAJOR * 100 + HIP_VERSION_MINOR >= 560 || 1 +#if HIP_VERSION_MAJOR * 100 + HIP_VERSION_MINOR >= 560 #define __DEPRECATED_SINCE_HIP_560(X) __attribute__((deprecated(X))) #else #define __DEPRECATED_SINCE_HIP_560(X) diff --git a/clang/lib/Headers/cpuid.h b/clang/lib/Headers/cpuid.h index 454f74e92f85..1ad6853a97c9 100644 --- a/clang/lib/Headers/cpuid.h +++ b/clang/lib/Headers/cpuid.h @@ -328,14 +328,4 @@ static __inline int __get_cpuid_count (unsigned int __leaf, return 1; } -// If MS extensions are enabled, __cpuidex is defined as a builtin which will -// conflict with the __cpuidex definition below. -#ifndef _MSC_EXTENSIONS -static __inline void __cpuidex (int __cpu_info[4], int __leaf, int __subleaf) -{ - __cpuid_count(__leaf, __subleaf, __cpu_info[0], __cpu_info[1], __cpu_info[2], - __cpu_info[3]); -} -#endif - #endif /* __CPUID_H */ diff --git a/clang/lib/Interpreter/IncrementalExecutor.cpp b/clang/lib/Interpreter/IncrementalExecutor.cpp index 3f8d60630de4..2c4dfc9a611e 100644 --- a/clang/lib/Interpreter/IncrementalExecutor.cpp +++ b/clang/lib/Interpreter/IncrementalExecutor.cpp @@ -92,12 +92,19 @@ llvm::Error IncrementalExecutor::runCtors() const { llvm::Expected IncrementalExecutor::getSymbolAddress(llvm::StringRef Name, SymbolNameKind NameKind) const { - auto Sym = (NameKind == LinkerName) ? Jit->lookupLinkerMangled(Name) - : Jit->lookup(Name); - - if (!Sym) - return Sym.takeError(); - return Sym; + using namespace llvm::orc; + auto SO = makeJITDylibSearchOrder({&Jit->getMainJITDylib(), + Jit->getPlatformJITDylib().get(), + Jit->getProcessSymbolsJITDylib().get()}); + + ExecutionSession &ES = Jit->getExecutionSession(); + + auto SymOrErr = + ES.lookup(SO, (NameKind == LinkerName) ? 
ES.intern(Name) + : Jit->mangleAndIntern(Name)); + if (auto Err = SymOrErr.takeError()) + return std::move(Err); + return SymOrErr->getAddress(); } } // end namespace clang diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp index 3b9913ac8ba4..a6f50832950c 100644 --- a/clang/lib/Lex/LiteralSupport.cpp +++ b/clang/lib/Lex/LiteralSupport.cpp @@ -57,6 +57,26 @@ static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) { } } +static unsigned getEncodingPrefixLen(tok::TokenKind kind) { + switch (kind) { + default: + llvm_unreachable("Unknown token type!"); + case tok::char_constant: + case tok::string_literal: + return 0; + case tok::utf8_char_constant: + case tok::utf8_string_literal: + return 2; + case tok::wide_char_constant: + case tok::wide_string_literal: + case tok::utf16_char_constant: + case tok::utf16_string_literal: + case tok::utf32_char_constant: + case tok::utf32_string_literal: + return 1; + } +} + static CharSourceRange MakeCharSourceRange(const LangOptions &Features, FullSourceLoc TokLoc, const char *TokBegin, @@ -343,7 +363,9 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin, Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, diag::err_unevaluated_string_invalid_escape_sequence) << StringRef(EscapeBegin, ThisTokBuf - EscapeBegin); + HadError = true; } + return ResultChar; } @@ -1917,9 +1939,22 @@ void StringLiteralParser::init(ArrayRef StringToks){ // Remember if we see any wide or utf-8/16/32 strings. // Also check for illegal concatenations. if (isUnevaluated() && Tok.getKind() != tok::string_literal) { - if (Diags) - Diags->Report(Tok.getLocation(), diag::err_unevaluated_string_prefix); - hadError = true; + if (Diags) { + SourceLocation PrefixEndLoc = Lexer::AdvanceToTokenCharacter( + Tok.getLocation(), getEncodingPrefixLen(Tok.getKind()), SM, + Features); + CharSourceRange Range = + CharSourceRange::getCharRange({Tok.getLocation(), PrefixEndLoc}); + StringRef Prefix(SM.getCharacterData(Tok.getLocation()), + getEncodingPrefixLen(Tok.getKind())); + Diags->Report(Tok.getLocation(), + Features.CPlusPlus26 + ? 
diag::err_unevaluated_string_prefix + : diag::warn_unevaluated_string_prefix) + << Prefix << Features.CPlusPlus << FixItHint::CreateRemoval(Range); + } + if (Features.CPlusPlus26) + hadError = true; } else if (Tok.isNot(Kind) && Tok.isNot(tok::string_literal)) { if (isOrdinary()) { Kind = Tok.getKind(); diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp index c1e09db2b3ee..d9ff6c42c502 100644 --- a/clang/lib/Parse/ParseDeclCXX.cpp +++ b/clang/lib/Parse/ParseDeclCXX.cpp @@ -1016,10 +1016,23 @@ Decl *Parser::ParseStaticAssertDeclaration(SourceLocation &DeclEnd) { return nullptr; } - if (isTokenStringLiteral()) - AssertMessage = ParseUnevaluatedStringLiteralExpression(); - else if (getLangOpts().CPlusPlus26) + bool ParseAsExpression = false; + if (getLangOpts().CPlusPlus26) { + for (unsigned I = 0;; ++I) { + const Token &T = GetLookAheadToken(I); + if (T.is(tok::r_paren)) + break; + if (!tok::isStringLiteral(Tok.getKind())) { + ParseAsExpression = true; + break; + } + } + } + + if (ParseAsExpression) AssertMessage = ParseConstantExpressionInExprEvalContext(); + else if (tok::isStringLiteral(Tok.getKind())) + AssertMessage = ParseUnevaluatedStringLiteralExpression(); else { Diag(Tok, diag::err_expected_string_literal) << /*Source='static_assert'*/ 1; diff --git a/clang/lib/Parse/ParseTentative.cpp b/clang/lib/Parse/ParseTentative.cpp index b7c83bbeb82e..664337052500 100644 --- a/clang/lib/Parse/ParseTentative.cpp +++ b/clang/lib/Parse/ParseTentative.cpp @@ -62,6 +62,7 @@ bool Parser::isCXXDeclarationStatement( case tok::kw_static_assert: case tok::kw__Static_assert: return true; + case tok::coloncolon: case tok::identifier: { if (DisambiguatingWithExpression) { RevertingTentativeParsingAction TPA(*this); diff --git a/clang/lib/Sema/SemaAvailability.cpp b/clang/lib/Sema/SemaAvailability.cpp index 05ad42780e50..84c06566387c 100644 --- a/clang/lib/Sema/SemaAvailability.cpp +++ b/clang/lib/Sema/SemaAvailability.cpp @@ -123,6 +123,18 @@ ShouldDiagnoseAvailabilityInContext(Sema &S, AvailabilityResult K, const NamedDecl *OffendingDecl) { assert(K != AR_Available && "Expected an unavailable declaration here!"); + // If this was defined using CF_OPTIONS, etc. then ignore the diagnostic. + auto DeclLoc = Ctx->getBeginLoc(); + // This is only a problem in Foundation's C++ implementation for CF_OPTIONS. + if (DeclLoc.isMacroID() && S.getLangOpts().CPlusPlus && + isa(OffendingDecl)) { + StringRef MacroName = S.getPreprocessor().getImmediateMacroName(DeclLoc); + if (MacroName == "CF_OPTIONS" || MacroName == "OBJC_OPTIONS" || + MacroName == "SWIFT_OPTIONS" || MacroName == "NS_OPTIONS") { + return false; + } + } + // Checks if we should emit the availability diagnostic in the context of C. auto CheckContext = [&](const Decl *C) { if (K == AR_NotYetIntroduced) { diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp index d65ecf52c523..b338d601db73 100644 --- a/clang/lib/Sema/SemaCast.cpp +++ b/clang/lib/Sema/SemaCast.cpp @@ -935,6 +935,14 @@ void CastOperation::CheckDynamicCast() { << isClangCL; } + // For a dynamic_cast to a final type, IR generation might emit a reference + // to the vtable. + if (DestRecord) { + auto *DestDecl = DestRecord->getAsCXXRecordDecl(); + if (DestDecl->isEffectivelyFinal()) + Self.MarkVTableUsed(OpRange.getBegin(), DestDecl); + } + // Done. Everything else is run-time checks. 
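The SemaCast.cpp addition above marks the destination class's vtable as used when dynamic_cast targets a final class, since IR generation may then compare vtable pointers directly instead of calling the runtime. A minimal sketch of the pattern it covers:

  struct Base { virtual ~Base(); };
  struct Derived final : Base { };

  bool isDerived(Base *b) {
    // Because Derived is final, codegen can test b's vptr against Derived's
    // vtable, so Sema must make sure that vtable is actually marked used.
    return dynamic_cast<Derived *>(b) != nullptr;
  }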
Kind = CK_Dynamic; } diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index a4bf57928470..21b5781a71cd 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -9154,7 +9154,8 @@ static FunctionDecl *CreateNewFunctionDecl(Sema &SemaRef, Declarator &D, bool HasPrototype = (D.isFunctionDeclarator() && D.getFunctionTypeInfo().hasPrototype) || (D.getDeclSpec().isTypeRep() && - D.getDeclSpec().getRepAsType().get()->isFunctionProtoType()) || + SemaRef.GetTypeFromParser(D.getDeclSpec().getRepAsType(), nullptr) + ->isFunctionProtoType()) || (!R->getAsAdjusted() && R->isFunctionProtoType()); assert( (HasPrototype || !SemaRef.getLangOpts().requiresStrictPrototypes()) && diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 2716b6677105..3a5e302cc03a 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -13880,56 +13880,6 @@ inline QualType Sema::CheckBitwiseOperands(ExprResult &LHS, ExprResult &RHS, return InvalidOperands(Loc, LHS, RHS); } -// Diagnose cases where the user write a logical and/or but probably meant a -// bitwise one. We do this when one of the operands is a non-bool integer and -// the other is a constant. -void Sema::diagnoseLogicalInsteadOfBitwise(Expr *Op1, Expr *Op2, - SourceLocation Loc, - BinaryOperatorKind Opc) { - if (Op1->getType()->isIntegerType() && !Op1->getType()->isBooleanType() && - Op2->getType()->isIntegerType() && !Op2->isValueDependent() && - // Don't warn in macros or template instantiations. - !Loc.isMacroID() && !inTemplateInstantiation() && - !Op2->getExprLoc().isMacroID() && - !Op1->getExprLoc().isMacroID()) { - bool IsOp1InMacro = Op1->getExprLoc().isMacroID(); - bool IsOp2InMacro = Op2->getExprLoc().isMacroID(); - - // Exclude the specific expression from triggering the warning. - if (!(IsOp1InMacro && IsOp2InMacro && Op1->getSourceRange() == Op2->getSourceRange())) { - // If the RHS can be constant folded, and if it constant folds to something - // that isn't 0 or 1 (which indicate a potential logical operation that - // happened to fold to true/false) then warn. - // Parens on the RHS are ignored. - // If the RHS can be constant folded, and if it constant folds to something - // that isn't 0 or 1 (which indicate a potential logical operation that - // happened to fold to true/false) then warn. - // Parens on the RHS are ignored. - Expr::EvalResult EVResult; - if (Op2->EvaluateAsInt(EVResult, Context)) { - llvm::APSInt Result = EVResult.Val.getInt(); - if ((getLangOpts().Bool && !Op2->getType()->isBooleanType() && - !Op2->getExprLoc().isMacroID()) || - (Result != 0 && Result != 1)) { - Diag(Loc, diag::warn_logical_instead_of_bitwise) - << Op2->getSourceRange() << (Opc == BO_LAnd ? "&&" : "||"); - // Suggest replacing the logical operator with the bitwise version - Diag(Loc, diag::note_logical_instead_of_bitwise_change_operator) - << (Opc == BO_LAnd ? "&" : "|") - << FixItHint::CreateReplacement( - SourceRange(Loc, getLocForEndOfToken(Loc)), - Opc == BO_LAnd ? 
"&" : "|"); - if (Opc == BO_LAnd) - // Suggest replacing "Foo() && kNonZero" with "Foo()" - Diag(Loc, diag::note_logical_instead_of_bitwise_remove_constant) - << FixItHint::CreateRemoval(SourceRange( - getLocForEndOfToken(Op1->getEndLoc()), Op2->getEndLoc())); - } - } - } - } -} - // C99 6.5.[13,14] inline QualType Sema::CheckLogicalOperands(ExprResult &LHS, ExprResult &RHS, SourceLocation Loc, @@ -13948,6 +13898,9 @@ inline QualType Sema::CheckLogicalOperands(ExprResult &LHS, ExprResult &RHS, } } + if (EnumConstantInBoolContext) + Diag(Loc, diag::warn_enum_constant_in_bool_context); + // WebAssembly tables can't be used with logical operators. QualType LHSTy = LHS.get()->getType(); QualType RHSTy = RHS.get()->getType(); @@ -13958,14 +13911,40 @@ inline QualType Sema::CheckLogicalOperands(ExprResult &LHS, ExprResult &RHS, return InvalidOperands(Loc, LHS, RHS); } - if (EnumConstantInBoolContext) { - // Warn when converting the enum constant to a boolean - Diag(Loc, diag::warn_enum_constant_in_bool_context); - } else { - // Diagnose cases where the user write a logical and/or but probably meant a - // bitwise one. - diagnoseLogicalInsteadOfBitwise(LHS.get(), RHS.get(), Loc, Opc); - diagnoseLogicalInsteadOfBitwise(RHS.get(), LHS.get(), Loc, Opc); + // Diagnose cases where the user write a logical and/or but probably meant a + // bitwise one. We do this when the LHS is a non-bool integer and the RHS + // is a constant. + if (!EnumConstantInBoolContext && LHS.get()->getType()->isIntegerType() && + !LHS.get()->getType()->isBooleanType() && + RHS.get()->getType()->isIntegerType() && !RHS.get()->isValueDependent() && + // Don't warn in macros or template instantiations. + !Loc.isMacroID() && !inTemplateInstantiation()) { + // If the RHS can be constant folded, and if it constant folds to something + // that isn't 0 or 1 (which indicate a potential logical operation that + // happened to fold to true/false) then warn. + // Parens on the RHS are ignored. + Expr::EvalResult EVResult; + if (RHS.get()->EvaluateAsInt(EVResult, Context)) { + llvm::APSInt Result = EVResult.Val.getInt(); + if ((getLangOpts().Bool && !RHS.get()->getType()->isBooleanType() && + !RHS.get()->getExprLoc().isMacroID()) || + (Result != 0 && Result != 1)) { + Diag(Loc, diag::warn_logical_instead_of_bitwise) + << RHS.get()->getSourceRange() << (Opc == BO_LAnd ? "&&" : "||"); + // Suggest replacing the logical operator with the bitwise version + Diag(Loc, diag::note_logical_instead_of_bitwise_change_operator) + << (Opc == BO_LAnd ? "&" : "|") + << FixItHint::CreateReplacement( + SourceRange(Loc, getLocForEndOfToken(Loc)), + Opc == BO_LAnd ? 
"&" : "|"); + if (Opc == BO_LAnd) + // Suggest replacing "Foo() && kNonZero" with "Foo()" + Diag(Loc, diag::note_logical_instead_of_bitwise_remove_constant) + << FixItHint::CreateRemoval( + SourceRange(getLocForEndOfToken(LHS.get()->getEndLoc()), + RHS.get()->getEndLoc())); + } + } } if (!Context.getLangOpts().CPlusPlus) { diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp index c4f4edb6666c..d1ff688c2a21 100644 --- a/clang/lib/Sema/SemaLookup.cpp +++ b/clang/lib/Sema/SemaLookup.cpp @@ -513,42 +513,21 @@ void LookupResult::resolveKind() { const NamedDecl *HasNonFunction = nullptr; llvm::SmallVector EquivalentNonFunctions; - llvm::BitVector RemovedDecls(N); - for (unsigned I = 0; I < N; I++) { + unsigned UniqueTagIndex = 0; + + unsigned I = 0; + while (I < N) { const NamedDecl *D = Decls[I]->getUnderlyingDecl(); D = cast(D->getCanonicalDecl()); // Ignore an invalid declaration unless it's the only one left. // Also ignore HLSLBufferDecl which not have name conflict with other Decls. - if ((D->isInvalidDecl() || isa(D)) && - N - RemovedDecls.count() > 1) { - RemovedDecls.set(I); + if ((D->isInvalidDecl() || isa(D)) && !(I == 0 && N == 1)) { + Decls[I] = Decls[--N]; continue; } - // C++ [basic.scope.hiding]p2: - // A class name or enumeration name can be hidden by the name of - // an object, function, or enumerator declared in the same - // scope. If a class or enumeration name and an object, function, - // or enumerator are declared in the same scope (in any order) - // with the same name, the class or enumeration name is hidden - // wherever the object, function, or enumerator name is visible. - if (HideTags && isa(D)) { - bool Hidden = false; - for (auto *OtherDecl : Decls) { - if (canHideTag(OtherDecl) && - getContextForScopeMatching(OtherDecl)->Equals( - getContextForScopeMatching(Decls[I]))) { - RemovedDecls.set(I); - Hidden = true; - break; - } - } - if (Hidden) - continue; - } - std::optional ExistingI; // Redeclarations of types via typedef can occur both within a scope @@ -581,7 +560,7 @@ void LookupResult::resolveKind() { if (isPreferredLookupResult(getSema(), getLookupKind(), Decls[I], Decls[*ExistingI])) Decls[*ExistingI] = Decls[I]; - RemovedDecls.set(I); + Decls[I] = Decls[--N]; continue; } @@ -592,6 +571,7 @@ void LookupResult::resolveKind() { } else if (isa(D)) { if (HasTag) Ambiguous = true; + UniqueTagIndex = I; HasTag = true; } else if (isa(D)) { HasFunction = true; @@ -607,7 +587,7 @@ void LookupResult::resolveKind() { if (getSema().isEquivalentInternalLinkageDeclaration(HasNonFunction, D)) { EquivalentNonFunctions.push_back(D); - RemovedDecls.set(I); + Decls[I] = Decls[--N]; continue; } @@ -615,6 +595,28 @@ void LookupResult::resolveKind() { } HasNonFunction = D; } + I++; + } + + // C++ [basic.scope.hiding]p2: + // A class name or enumeration name can be hidden by the name of + // an object, function, or enumerator declared in the same + // scope. If a class or enumeration name and an object, function, + // or enumerator are declared in the same scope (in any order) + // with the same name, the class or enumeration name is hidden + // wherever the object, function, or enumerator name is visible. + // But it's still an error if there are distinct tag types found, + // even if they're not visible. (ref?) + if (N > 1 && HideTags && HasTag && !Ambiguous && + (HasFunction || HasNonFunction || HasUnresolved)) { + const NamedDecl *OtherDecl = Decls[UniqueTagIndex ? 
0 : N - 1]; + if (isa(Decls[UniqueTagIndex]->getUnderlyingDecl()) && + getContextForScopeMatching(Decls[UniqueTagIndex])->Equals( + getContextForScopeMatching(OtherDecl)) && + canHideTag(OtherDecl)) + Decls[UniqueTagIndex] = Decls[--N]; + else + Ambiguous = true; } // FIXME: This diagnostic should really be delayed until we're done with @@ -623,15 +625,9 @@ void LookupResult::resolveKind() { getSema().diagnoseEquivalentInternalLinkageDeclarations( getNameLoc(), HasNonFunction, EquivalentNonFunctions); - // Remove decls by replacing them with decls from the end (which - // means that we need to iterate from the end) and then truncating - // to the new size. - for (int I = RemovedDecls.find_last(); I >= 0; I = RemovedDecls.find_prev(I)) - Decls[I] = Decls[--N]; Decls.truncate(N); - if ((HasNonFunction && (HasFunction || HasUnresolved)) || - (HideTags && HasTag && (HasFunction || HasNonFunction || HasUnresolved))) + if (HasNonFunction && (HasFunction || HasUnresolved)) Ambiguous = true; if (Ambiguous) diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 10b3587885e3..097e81ea7d45 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -7478,6 +7478,10 @@ StmtResult TreeTransform::TransformCompoundStmt(CompoundStmt *S, bool IsStmtExpr) { Sema::CompoundScopeRAII CompoundScope(getSema()); + Sema::FPFeaturesStateRAII FPSave(getSema()); + if (S->hasStoredFPFeatures()) + getSema().resetFPOptions( + S->getStoredFPFeatures().applyOverrides(getSema().getLangOpts())); const Stmt *ExprResult = S->getStmtExprResult(); bool SubStmtInvalid = false; diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index 10c92f8d2149..c8cbee14be4f 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -181,6 +181,13 @@ namespace clang { static void setAnonymousDeclForMerging(ASTReader &Reader, DeclContext *DC, unsigned Index, NamedDecl *D); + /// Commit to a primary definition of the class RD, which is known to be + /// a definition of the class. We might not have read the definition data + /// for it yet. If we haven't then allocate placeholder definition data + /// now too. + static CXXRecordDecl *getOrFakePrimaryClassDefinition(ASTReader &Reader, + CXXRecordDecl *RD); + /// Results from loading a RedeclarableDecl. class RedeclarableResult { Decl *MergeWith; @@ -598,7 +605,13 @@ void ASTDeclReader::VisitDecl(Decl *D) { auto *LexicalDC = readDeclAs(); if (!LexicalDC) LexicalDC = SemaDC; - DeclContext *MergedSemaDC = Reader.MergedDeclContexts.lookup(SemaDC); + // If the context is a class, we might not have actually merged it yet, in + // the case where the definition comes from an update record. + DeclContext *MergedSemaDC; + if (auto *RD = dyn_cast(SemaDC)) + MergedSemaDC = getOrFakePrimaryClassDefinition(Reader, RD); + else + MergedSemaDC = Reader.MergedDeclContexts.lookup(SemaDC); // Avoid calling setLexicalDeclContext() directly because it uses // Decl::getASTContext() internally which is unsafe during derialization. D->setDeclContextsImpl(MergedSemaDC ? MergedSemaDC : SemaDC, LexicalDC, @@ -3198,6 +3211,32 @@ uint64_t ASTReader::getGlobalBitOffset(ModuleFile &M, uint64_t LocalOffset) { return LocalOffset + M.GlobalBitOffset; } +CXXRecordDecl * +ASTDeclReader::getOrFakePrimaryClassDefinition(ASTReader &Reader, + CXXRecordDecl *RD) { + // Try to dig out the definition. 
+ auto *DD = RD->DefinitionData; + if (!DD) + DD = RD->getCanonicalDecl()->DefinitionData; + + // If there's no definition yet, then DC's definition is added by an update + // record, but we've not yet loaded that update record. In this case, we + // commit to DC being the canonical definition now, and will fix this when + // we load the update record. + if (!DD) { + DD = new (Reader.getContext()) struct CXXRecordDecl::DefinitionData(RD); + RD->setCompleteDefinition(true); + RD->DefinitionData = DD; + RD->getCanonicalDecl()->DefinitionData = DD; + + // Track that we did this horrible thing so that we can fix it later. + Reader.PendingFakeDefinitionData.insert( + std::make_pair(DD, ASTReader::PendingFakeDefinitionKind::Fake)); + } + + return DD->Definition; +} + /// Find the context in which we should search for previous declarations when /// looking for declarations to merge. DeclContext *ASTDeclReader::getPrimaryContextForMerging(ASTReader &Reader, @@ -3205,29 +3244,8 @@ DeclContext *ASTDeclReader::getPrimaryContextForMerging(ASTReader &Reader, if (auto *ND = dyn_cast(DC)) return ND->getOriginalNamespace(); - if (auto *RD = dyn_cast(DC)) { - // Try to dig out the definition. - auto *DD = RD->DefinitionData; - if (!DD) - DD = RD->getCanonicalDecl()->DefinitionData; - - // If there's no definition yet, then DC's definition is added by an update - // record, but we've not yet loaded that update record. In this case, we - // commit to DC being the canonical definition now, and will fix this when - // we load the update record. - if (!DD) { - DD = new (Reader.getContext()) struct CXXRecordDecl::DefinitionData(RD); - RD->setCompleteDefinition(true); - RD->DefinitionData = DD; - RD->getCanonicalDecl()->DefinitionData = DD; - - // Track that we did this horrible thing so that we can fix it later. 
- Reader.PendingFakeDefinitionData.insert( - std::make_pair(DD, ASTReader::PendingFakeDefinitionKind::Fake)); - } - - return DD->Definition; - } + if (auto *RD = dyn_cast(DC)) + return getOrFakePrimaryClassDefinition(Reader, RD); if (auto *RD = dyn_cast(DC)) return RD->getDefinition(); diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index 59dbc36d24e8..8dd78152bd68 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -580,7 +580,7 @@ void ASTDeclWriter::VisitDeclaratorDecl(DeclaratorDecl *D) { } void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) { - static_assert(DeclContext::NumFunctionDeclBits == 30, + static_assert(DeclContext::NumFunctionDeclBits == 31, "You need to update the serializer after you change the " "FunctionDeclBits"); @@ -1495,7 +1495,7 @@ void ASTDeclWriter::VisitCXXMethodDecl(CXXMethodDecl *D) { } void ASTDeclWriter::VisitCXXConstructorDecl(CXXConstructorDecl *D) { - static_assert(DeclContext::NumCXXConstructorDeclBits == 21, + static_assert(DeclContext::NumCXXConstructorDeclBits == 20, "You need to update the serializer after you change the " "CXXConstructorDeclBits"); diff --git a/compiler-rt/lib/asan/asan_interceptors.cpp b/compiler-rt/lib/asan/asan_interceptors.cpp index b9b82564b330..5158e99b75e5 100644 --- a/compiler-rt/lib/asan/asan_interceptors.cpp +++ b/compiler-rt/lib/asan/asan_interceptors.cpp @@ -588,19 +588,34 @@ INTERCEPTOR(char*, strncpy, char *to, const char *from, uptr size) { return REAL(strncpy)(to, from, size); } -INTERCEPTOR(long, strtol, const char *nptr, char **endptr, int base) { - void *ctx; - ASAN_INTERCEPTOR_ENTER(ctx, strtol); - ENSURE_ASAN_INITED(); - if (!flags()->replace_str) { - return REAL(strtol)(nptr, endptr, base); - } +template +static ALWAYS_INLINE auto StrtolImpl(void *ctx, Fn real, const char *nptr, + char **endptr, int base) + -> decltype(real(nullptr, nullptr, 0)) { + if (!flags()->replace_str) + return real(nptr, endptr, base); char *real_endptr; - long result = REAL(strtol)(nptr, &real_endptr, base); + auto res = real(nptr, &real_endptr, base); StrtolFixAndCheck(ctx, nptr, endptr, real_endptr, base); - return result; + return res; } +# define INTERCEPTOR_STRTO_BASE(ret_type, func) \ + INTERCEPTOR(ret_type, func, const char *nptr, char **endptr, int base) { \ + void *ctx; \ + ASAN_INTERCEPTOR_ENTER(ctx, func); \ + ENSURE_ASAN_INITED(); \ + return StrtolImpl(ctx, REAL(func), nptr, endptr, base); \ + } + +INTERCEPTOR_STRTO_BASE(long, strtol) +INTERCEPTOR_STRTO_BASE(long long, strtoll) + +# if SANITIZER_GLIBC +INTERCEPTOR_STRTO_BASE(long, __isoc23_strtol) +INTERCEPTOR_STRTO_BASE(long long, __isoc23_strtoll) +# endif + INTERCEPTOR(int, atoi, const char *nptr) { void *ctx; ASAN_INTERCEPTOR_ENTER(ctx, atoi); @@ -639,20 +654,6 @@ INTERCEPTOR(long, atol, const char *nptr) { return result; } -#if ASAN_INTERCEPT_ATOLL_AND_STRTOLL -INTERCEPTOR(long long, strtoll, const char *nptr, char **endptr, int base) { - void *ctx; - ASAN_INTERCEPTOR_ENTER(ctx, strtoll); - ENSURE_ASAN_INITED(); - if (!flags()->replace_str) { - return REAL(strtoll)(nptr, endptr, base); - } - char *real_endptr; - long long result = REAL(strtoll)(nptr, &real_endptr, base); - StrtolFixAndCheck(ctx, nptr, endptr, real_endptr, base); - return result; -} - INTERCEPTOR(long long, atoll, const char *nptr) { void *ctx; ASAN_INTERCEPTOR_ENTER(ctx, atoll); @@ -666,7 +667,6 @@ INTERCEPTOR(long long, atoll, const char *nptr) { ASAN_READ_STRING(ctx, nptr, (real_endptr - nptr) 
+ 1); return result; } -#endif // ASAN_INTERCEPT_ATOLL_AND_STRTOLL #if ASAN_INTERCEPT___CXA_ATEXIT || ASAN_INTERCEPT_ATEXIT static void AtCxaAtexit(void *unused) { @@ -751,11 +751,13 @@ void InitializeAsanInterceptors() { ASAN_INTERCEPT_FUNC(atoi); ASAN_INTERCEPT_FUNC(atol); - ASAN_INTERCEPT_FUNC(strtol); -#if ASAN_INTERCEPT_ATOLL_AND_STRTOLL ASAN_INTERCEPT_FUNC(atoll); + ASAN_INTERCEPT_FUNC(strtol); ASAN_INTERCEPT_FUNC(strtoll); -#endif +# if SANITIZER_GLIBC + ASAN_INTERCEPT_FUNC(__isoc23_strtol); + ASAN_INTERCEPT_FUNC(__isoc23_strtoll); +# endif // Intecept jump-related functions. ASAN_INTERCEPT_FUNC(longjmp); diff --git a/compiler-rt/lib/asan/asan_interceptors.h b/compiler-rt/lib/asan/asan_interceptors.h index 268096fea5e7..d00d05587b36 100644 --- a/compiler-rt/lib/asan/asan_interceptors.h +++ b/compiler-rt/lib/asan/asan_interceptors.h @@ -42,12 +42,10 @@ void InitializePlatformInterceptors(); // Use macro to describe if specific function should be // intercepted on a given platform. #if !SANITIZER_WINDOWS -# define ASAN_INTERCEPT_ATOLL_AND_STRTOLL 1 # define ASAN_INTERCEPT__LONGJMP 1 # define ASAN_INTERCEPT_INDEX 1 # define ASAN_INTERCEPT_PTHREAD_CREATE 1 #else -# define ASAN_INTERCEPT_ATOLL_AND_STRTOLL 0 # define ASAN_INTERCEPT__LONGJMP 0 # define ASAN_INTERCEPT_INDEX 0 # define ASAN_INTERCEPT_PTHREAD_CREATE 0 diff --git a/compiler-rt/lib/asan/asan_win_dll_thunk.cpp b/compiler-rt/lib/asan/asan_win_dll_thunk.cpp index e3a90f18ed81..0fa636bec0d0 100644 --- a/compiler-rt/lib/asan/asan_win_dll_thunk.cpp +++ b/compiler-rt/lib/asan/asan_win_dll_thunk.cpp @@ -65,6 +65,7 @@ INTERCEPT_WRAP_W_W(_expand_dbg) INTERCEPT_LIBRARY_FUNCTION(atoi); INTERCEPT_LIBRARY_FUNCTION(atol); +INTERCEPT_LIBRARY_FUNCTION(atoll); INTERCEPT_LIBRARY_FUNCTION(frexp); INTERCEPT_LIBRARY_FUNCTION(longjmp); #if SANITIZER_INTERCEPT_MEMCHR @@ -91,6 +92,7 @@ INTERCEPT_LIBRARY_FUNCTION(strspn); INTERCEPT_LIBRARY_FUNCTION(strstr); INTERCEPT_LIBRARY_FUNCTION(strtok); INTERCEPT_LIBRARY_FUNCTION(strtol); +INTERCEPT_LIBRARY_FUNCTION(strtoll); INTERCEPT_LIBRARY_FUNCTION(wcslen); INTERCEPT_LIBRARY_FUNCTION(wcsnlen); diff --git a/compiler-rt/lib/builtins/clear_cache.c b/compiler-rt/lib/builtins/clear_cache.c index 54cbda059315..2ac99b25c243 100644 --- a/compiler-rt/lib/builtins/clear_cache.c +++ b/compiler-rt/lib/builtins/clear_cache.c @@ -113,7 +113,7 @@ void __clear_cache(void *start, void *end) { #elif defined(__linux__) || defined(__OpenBSD__) // Pre-R6 may not be globalized. And some implementations may give strange // synci_step. So, let's use libc call for it. - cacheflush(start, end_int - start_int, BCACHE); + _flush_cache(start, end_int - start_int, BCACHE); #else (void)start_int; (void)end_int; diff --git a/compiler-rt/lib/builtins/cpu_model.c b/compiler-rt/lib/builtins/cpu_model.c index 36eb696c39ee..0bd7296fb252 100644 --- a/compiler-rt/lib/builtins/cpu_model.c +++ b/compiler-rt/lib/builtins/cpu_model.c @@ -751,8 +751,11 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf, if (HasLeaf7 && ((EDX >> 8) & 1) && HasAVX512Save) setFeature(FEATURE_AVX512VP2INTERSECT); + // EAX from subleaf 0 is the maximum subleaf supported. Some CPUs don't + // return all 0s for invalid subleaves so check the limit. 
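// Illustrative sketch, not part of the upstream change: CPUID leaf 7 reports
// its highest supported subleaf in EAX of subleaf 0, which is what the comment
// above relies on. Assuming only the getX86CpuIDAndInfoEx() helper that
// cpu_model.c already uses (a nonzero return is treated as failure, as in the
// surrounding code), the guard can be pictured as:
static int hasLeaf7Subleaf1(unsigned MaxLeaf) {
  unsigned EAX, EBX, ECX, EDX;
  if (MaxLeaf < 0x7)
    return 0;  // leaf 7 itself is not available
  if (getX86CpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX))
    return 0;
  if (EAX < 1)
    return 0;  // EAX of subleaf 0 is the maximum valid subleaf; 1 is not enumerated
  return !getX86CpuIDAndInfoEx(0x7, 0x1, &EAX, &EBX, &ECX, &EDX);
}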
bool HasLeaf7Subleaf1 = - MaxLeaf >= 0x7 && !getX86CpuIDAndInfoEx(0x7, 0x1, &EAX, &EBX, &ECX, &EDX); + HasLeaf7 && EAX >= 1 && + !getX86CpuIDAndInfoEx(0x7, 0x1, &EAX, &EBX, &ECX, &EDX); if (HasLeaf7Subleaf1 && ((EAX >> 5) & 1) && HasAVX512Save) setFeature(FEATURE_AVX512BF16); diff --git a/compiler-rt/lib/interception/interception.h b/compiler-rt/lib/interception/interception.h index 078d33b61be3..069f73d276f3 100644 --- a/compiler-rt/lib/interception/interception.h +++ b/compiler-rt/lib/interception/interception.h @@ -181,7 +181,7 @@ const interpose_substitution substitution_##func_name[] \ // FreeBSD's dynamic linker (incompliantly) gives non-weak symbols higher // priority than weak ones so weak aliases won't work for indirect calls // in position-independent (-fPIC / -fPIE) mode. -# define __ASM_WEAK_WRAPPER(func) +# define __ASM_WEAK_WRAPPER(func) ".globl " #func "\n" # else # define __ASM_WEAK_WRAPPER(func) ".weak " #func "\n" # endif // SANITIZER_FREEBSD || SANITIZER_NETBSD diff --git a/compiler-rt/lib/msan/msan_interceptors.cpp b/compiler-rt/lib/msan/msan_interceptors.cpp index f5e0d3cb9a67..ba92bd14d319 100644 --- a/compiler-rt/lib/msan/msan_interceptors.cpp +++ b/compiler-rt/lib/msan/msan_interceptors.cpp @@ -464,6 +464,25 @@ INTERCEPTORS_STRTO_BASE(long long, wcstoll, wchar_t) INTERCEPTORS_STRTO_BASE(unsigned long, wcstoul, wchar_t) INTERCEPTORS_STRTO_BASE(unsigned long long, wcstoull, wchar_t) +#if SANITIZER_GLIBC +INTERCEPTORS_STRTO(double, __isoc23_strtod, char) +INTERCEPTORS_STRTO(float, __isoc23_strtof, char) +INTERCEPTORS_STRTO(long double, __isoc23_strtold, char) +INTERCEPTORS_STRTO_BASE(long, __isoc23_strtol, char) +INTERCEPTORS_STRTO_BASE(long long, __isoc23_strtoll, char) +INTERCEPTORS_STRTO_BASE(unsigned long, __isoc23_strtoul, char) +INTERCEPTORS_STRTO_BASE(unsigned long long, __isoc23_strtoull, char) +INTERCEPTORS_STRTO_BASE(u64, __isoc23_strtouq, char) + +INTERCEPTORS_STRTO(double, __isoc23_wcstod, wchar_t) +INTERCEPTORS_STRTO(float, __isoc23_wcstof, wchar_t) +INTERCEPTORS_STRTO(long double, __isoc23_wcstold, wchar_t) +INTERCEPTORS_STRTO_BASE(long, __isoc23_wcstol, wchar_t) +INTERCEPTORS_STRTO_BASE(long long, __isoc23_wcstoll, wchar_t) +INTERCEPTORS_STRTO_BASE(unsigned long, __isoc23_wcstoul, wchar_t) +INTERCEPTORS_STRTO_BASE(unsigned long long, __isoc23_wcstoull, wchar_t) +#endif + #if SANITIZER_NETBSD #define INTERCEPT_STRTO(func) \ INTERCEPT_FUNCTION(func); \ @@ -1748,6 +1767,24 @@ void InitializeInterceptors() { INTERCEPT_STRTO(wcstoul); INTERCEPT_STRTO(wcstoll); INTERCEPT_STRTO(wcstoull); +#if SANITIZER_GLIBC + INTERCEPT_STRTO(__isoc23_strtod); + INTERCEPT_STRTO(__isoc23_strtof); + INTERCEPT_STRTO(__isoc23_strtold); + INTERCEPT_STRTO(__isoc23_strtol); + INTERCEPT_STRTO(__isoc23_strtoul); + INTERCEPT_STRTO(__isoc23_strtoll); + INTERCEPT_STRTO(__isoc23_strtoull); + INTERCEPT_STRTO(__isoc23_strtouq); + INTERCEPT_STRTO(__isoc23_wcstod); + INTERCEPT_STRTO(__isoc23_wcstof); + INTERCEPT_STRTO(__isoc23_wcstold); + INTERCEPT_STRTO(__isoc23_wcstol); + INTERCEPT_STRTO(__isoc23_wcstoul); + INTERCEPT_STRTO(__isoc23_wcstoll); + INTERCEPT_STRTO(__isoc23_wcstoull); +#endif + #ifdef SANITIZER_NLDBL_VERSION INTERCEPT_FUNCTION_VER(vswprintf, SANITIZER_NLDBL_VERSION); INTERCEPT_FUNCTION_VER(swprintf, SANITIZER_NLDBL_VERSION); diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c index 54e3030d5899..2bd6a49ce065 100644 --- a/compiler-rt/lib/profile/InstrProfilingFile.c +++ b/compiler-rt/lib/profile/InstrProfilingFile.c @@ -424,10 
+424,13 @@ static void createProfileDir(const char *Filename) { * its instrumented shared libraries dump profile data into their own data file. */ static FILE *openFileForMerging(const char *ProfileFileName, int *MergeDone) { - FILE *ProfileFile = getProfileFile(); + FILE *ProfileFile = NULL; int rc; - if (!ProfileFile) { + ProfileFile = getProfileFile(); + if (ProfileFile) { + lprofLockFileHandle(ProfileFile); + } else { createProfileDir(ProfileFileName); ProfileFile = lprofOpenFileEx(ProfileFileName); } @@ -478,6 +481,9 @@ static int writeFile(const char *OutputName) { if (OutputFile == getProfileFile()) { fflush(OutputFile); + if (doMerging()) { + lprofUnlockFileHandle(OutputFile); + } } else { fclose(OutputFile); } diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc index 299561b3ad3a..0e563fa12022 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc @@ -1491,6 +1491,16 @@ VSCANF_INTERCEPTOR_IMPL(__isoc99_vsscanf, false, str, format, ap) INTERCEPTOR(int, __isoc99_vfscanf, void *stream, const char *format, va_list ap) VSCANF_INTERCEPTOR_IMPL(__isoc99_vfscanf, false, stream, format, ap) + +INTERCEPTOR(int, __isoc23_vscanf, const char *format, va_list ap) +VSCANF_INTERCEPTOR_IMPL(__isoc23_vscanf, false, format, ap) + +INTERCEPTOR(int, __isoc23_vsscanf, const char *str, const char *format, + va_list ap) +VSCANF_INTERCEPTOR_IMPL(__isoc23_vsscanf, false, str, format, ap) + +INTERCEPTOR(int, __isoc23_vfscanf, void *stream, const char *format, va_list ap) +VSCANF_INTERCEPTOR_IMPL(__isoc23_vfscanf, false, stream, format, ap) #endif // SANITIZER_INTERCEPT_ISOC99_SCANF INTERCEPTOR(int, scanf, const char *format, ...) @@ -1511,6 +1521,15 @@ FORMAT_INTERCEPTOR_IMPL(__isoc99_fscanf, __isoc99_vfscanf, stream, format) INTERCEPTOR(int, __isoc99_sscanf, const char *str, const char *format, ...) FORMAT_INTERCEPTOR_IMPL(__isoc99_sscanf, __isoc99_vsscanf, str, format) + +INTERCEPTOR(int, __isoc23_scanf, const char *format, ...) +FORMAT_INTERCEPTOR_IMPL(__isoc23_scanf, __isoc23_vscanf, format) + +INTERCEPTOR(int, __isoc23_fscanf, void *stream, const char *format, ...) +FORMAT_INTERCEPTOR_IMPL(__isoc23_fscanf, __isoc23_vfscanf, stream, format) + +INTERCEPTOR(int, __isoc23_sscanf, const char *str, const char *format, ...) 
+FORMAT_INTERCEPTOR_IMPL(__isoc23_sscanf, __isoc23_vsscanf, str, format) #endif #endif @@ -1534,7 +1553,13 @@ FORMAT_INTERCEPTOR_IMPL(__isoc99_sscanf, __isoc99_vsscanf, str, format) COMMON_INTERCEPT_FUNCTION(__isoc99_fscanf); \ COMMON_INTERCEPT_FUNCTION(__isoc99_vscanf); \ COMMON_INTERCEPT_FUNCTION(__isoc99_vsscanf); \ - COMMON_INTERCEPT_FUNCTION(__isoc99_vfscanf); + COMMON_INTERCEPT_FUNCTION(__isoc99_vfscanf); \ + COMMON_INTERCEPT_FUNCTION(__isoc23_scanf); \ + COMMON_INTERCEPT_FUNCTION(__isoc23_sscanf); \ + COMMON_INTERCEPT_FUNCTION(__isoc23_fscanf); \ + COMMON_INTERCEPT_FUNCTION(__isoc23_vscanf); \ + COMMON_INTERCEPT_FUNCTION(__isoc23_vsscanf); \ + COMMON_INTERCEPT_FUNCTION(__isoc23_vfscanf); #else #define INIT_ISOC99_SCANF #endif @@ -3539,30 +3564,26 @@ UNUSED static inline void StrtolFixAndCheck(void *ctx, const char *nptr, (real_endptr - nptr) + 1 : 0); } - #if SANITIZER_INTERCEPT_STRTOIMAX -INTERCEPTOR(INTMAX_T, strtoimax, const char *nptr, char **endptr, int base) { - void *ctx; - COMMON_INTERCEPTOR_ENTER(ctx, strtoimax, nptr, endptr, base); - // FIXME: under ASan the call below may write to freed memory and corrupt - // its metadata. See - // https://github.com/google/sanitizers/issues/321. +template +static ALWAYS_INLINE auto StrtoimaxImpl(void *ctx, Fn real, const char *nptr, + char **endptr, int base) + -> decltype(real(nullptr, nullptr, 0)) { char *real_endptr; - INTMAX_T res = REAL(strtoimax)(nptr, &real_endptr, base); + auto res = real(nptr, &real_endptr, base); StrtolFixAndCheck(ctx, nptr, endptr, real_endptr, base); return res; } +INTERCEPTOR(INTMAX_T, strtoimax, const char *nptr, char **endptr, int base) { + void *ctx; + COMMON_INTERCEPTOR_ENTER(ctx, strtoimax, nptr, endptr, base); + return StrtoimaxImpl(ctx, REAL(strtoimax), nptr, endptr, base); +} INTERCEPTOR(UINTMAX_T, strtoumax, const char *nptr, char **endptr, int base) { void *ctx; COMMON_INTERCEPTOR_ENTER(ctx, strtoumax, nptr, endptr, base); - // FIXME: under ASan the call below may write to freed memory and corrupt - // its metadata. See - // https://github.com/google/sanitizers/issues/321. 
- char *real_endptr; - UINTMAX_T res = REAL(strtoumax)(nptr, &real_endptr, base); - StrtolFixAndCheck(ctx, nptr, endptr, real_endptr, base); - return res; + return StrtoimaxImpl(ctx, REAL(strtoumax), nptr, endptr, base); } #define INIT_STRTOIMAX \ @@ -3572,6 +3593,25 @@ INTERCEPTOR(UINTMAX_T, strtoumax, const char *nptr, char **endptr, int base) { #define INIT_STRTOIMAX #endif +#if SANITIZER_INTERCEPT_STRTOIMAX && SANITIZER_GLIBC +INTERCEPTOR(INTMAX_T, __isoc23_strtoimax, const char *nptr, char **endptr, int base) { + void *ctx; + COMMON_INTERCEPTOR_ENTER(ctx, __isoc23_strtoimax, nptr, endptr, base); + return StrtoimaxImpl(ctx, REAL(__isoc23_strtoimax), nptr, endptr, base); +} +INTERCEPTOR(UINTMAX_T, __isoc23_strtoumax, const char *nptr, char **endptr, int base) { + void *ctx; + COMMON_INTERCEPTOR_ENTER(ctx, __isoc23_strtoumax, nptr, endptr, base); + return StrtoimaxImpl(ctx, REAL(__isoc23_strtoumax), nptr, endptr, base); +} + +# define INIT_STRTOIMAX_C23 \ + COMMON_INTERCEPT_FUNCTION(__isoc23_strtoimax); \ + COMMON_INTERCEPT_FUNCTION(__isoc23_strtoumax); +#else +# define INIT_STRTOIMAX_C23 +#endif + #if SANITIZER_INTERCEPT_MBSTOWCS INTERCEPTOR(SIZE_T, mbstowcs, wchar_t *dest, const char *src, SIZE_T len) { void *ctx; @@ -10304,6 +10344,7 @@ static void InitializeCommonInterceptors() { INIT_GETCWD; INIT_GET_CURRENT_DIR_NAME; INIT_STRTOIMAX; + INIT_STRTOIMAX_C23; INIT_MBSTOWCS; INIT_MBSNRTOWCS; INIT_WCSTOMBS; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace_sparc.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace_sparc.cpp index 1e635a66978f..a2000798a390 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace_sparc.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace_sparc.cpp @@ -30,13 +30,7 @@ void BufferedStackTrace::UnwindFast(uptr pc, uptr bp, uptr stack_top, // TODO(yln): add arg sanity check for stack_top/stack_bottom CHECK_GE(max_depth, 2); const uptr kPageSize = GetPageSizeCached(); -#if defined(__GNUC__) - // __builtin_return_address returns the address of the call instruction - // on the SPARC and not the return address, so we need to compensate. - trace_buffer[0] = GetNextInstructionPc(pc); -#else trace_buffer[0] = pc; -#endif size = 1; if (stack_top < 4096) return; // Sanity check for stack top. // Flush register windows to memory diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_unwind_linux_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_unwind_linux_libcdep.cpp index 72f025a7d307..6a8e82e2e213 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_unwind_linux_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_unwind_linux_libcdep.cpp @@ -139,13 +139,7 @@ void BufferedStackTrace::UnwindSlow(uptr pc, u32 max_depth) { if (to_pop == 0 && size > 1) to_pop = 1; PopStackFrames(to_pop); -#if defined(__GNUC__) && defined(__sparc__) - // __builtin_return_address returns the address of the call instruction - // on the SPARC and not the return address, so we need to compensate. 
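// Illustrative sketch, not from the upstream change: the comment above concerns
// SPARC, where the saved return address points at the call instruction itself
// rather than at the instruction where execution resumes. Since the call and
// its delay slot are each 4 bytes, the compensation that GetNextInstructionPc()
// performs on this target boils down to the following (helper name invented
// here for illustration):
static inline uptr SparcResumePcFromReturnAddress(uptr return_pc) {
  return return_pc + 8;  // skip the call instruction and its delay slot
}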
- trace_buffer[0] = GetNextInstructionPc(pc); -#else trace_buffer[0] = pc; -#endif } void BufferedStackTrace::UnwindSlow(uptr pc, void *context, u32 max_depth) { diff --git a/compiler-rt/lib/sanitizer_common/symbolizer/scripts/global_symbols.txt b/compiler-rt/lib/sanitizer_common/symbolizer/scripts/global_symbols.txt index 509e3f19fe38..819cfca44b00 100644 --- a/compiler-rt/lib/sanitizer_common/symbolizer/scripts/global_symbols.txt +++ b/compiler-rt/lib/sanitizer_common/symbolizer/scripts/global_symbols.txt @@ -34,6 +34,13 @@ __interceptor_pthread_setspecific w __interceptor_read w __interceptor_realpath w __isinf U +__isoc23_sscanf U +__isoc23_strtol U +__isoc23_strtoll U +__isoc23_strtoll_l U +__isoc23_strtoull U +__isoc23_strtoull_l U +__isoc23_vsscanf U __isoc99_sscanf U __isoc99_vsscanf U __moddi3 U diff --git a/libcxx/include/__algorithm/pstl_sort.h b/libcxx/include/__algorithm/pstl_sort.h index 81514953f24b..75c77ed40527 100644 --- a/libcxx/include/__algorithm/pstl_sort.h +++ b/libcxx/include/__algorithm/pstl_sort.h @@ -17,6 +17,7 @@ #include <__type_traits/is_execution_policy.h> #include <__type_traits/remove_cvref.h> #include <__utility/forward.h> +#include <__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__format/format_functions.h b/libcxx/include/__format/format_functions.h index 27ec0a295f4f..bb62c1ce10c1 100644 --- a/libcxx/include/__format/format_functions.h +++ b/libcxx/include/__format/format_functions.h @@ -245,6 +245,9 @@ __handle_replacement_field(_Iterator __begin, _Iterator __end, using _CharT = iter_value_t<_Iterator>; __format::__parse_number_result __r = __format::__parse_arg_id(__begin, __end, __parse_ctx); + if (__r.__last == __end) + std::__throw_format_error("The argument index should end with a ':' or a '}'"); + bool __parse = *__r.__last == _CharT(':'); switch (*__r.__last) { case _CharT(':'): diff --git a/libcxx/include/__locale_dir/locale_base_api/locale_guard.h b/libcxx/include/__locale_dir/locale_base_api/locale_guard.h index 0e2e91af7d19..5946ed698e0f 100644 --- a/libcxx/include/__locale_dir/locale_base_api/locale_guard.h +++ b/libcxx/include/__locale_dir/locale_base_api/locale_guard.h @@ -10,6 +10,7 @@ #define _LIBCPP___LOCALE_LOCALE_BASE_API_LOCALE_GUARD_H #include <__config> +#include <__locale> // for locale_t #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__mdspan/layout_left.h b/libcxx/include/__mdspan/layout_left.h index f890c5ae0256..7503dcf77d13 100644 --- a/libcxx/include/__mdspan/layout_left.h +++ b/libcxx/include/__mdspan/layout_left.h @@ -164,7 +164,7 @@ class layout_left::mapping { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( __r < extents_type::rank(), "layout_left::mapping::stride(): invalid rank index"); index_type __s = 1; - for (rank_type __i = extents_type::rank() - 1; __i > __r; __i--) + for (rank_type __i = 0; __i < __r; __i++) __s *= __extents_.extent(__i); return __s; } diff --git a/libcxx/include/__std_clang_module b/libcxx/include/__std_clang_module new file mode 100644 index 000000000000..4d02336d30b0 --- /dev/null +++ b/libcxx/include/__std_clang_module @@ -0,0 +1,226 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// WARNING, this entire header is generated by +// utils/generate_std_clang_module_header.py +// DO NOT MODIFY! + +// This header should not be directly included, it's exclusively to import all +// of the libc++ public clang modules for the `std` clang module to export. In +// other words, it's to facilitate `@import std;` in Objective-C++ and `import std` +// in Swift to expose all of the libc++ interfaces. This is generally not +// recommended, however there are some clients that need to import all of libc++ +// without knowing what "all" is. +#if !__building_module(std) +# error "Do not include this header directly, include individual headers instead" +#endif + +#include <__config> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +#include +#include +#include +#if !defined(_LIBCPP_HAS_NO_ATOMIC_HEADER) +# include +#endif +#if !defined(_LIBCPP_HAS_NO_THREADS) +# include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if !defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS) +# include +#endif +#if !defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS) +# include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#include +#if !defined(_LIBCPP_HAS_NO_THREADS) +# include +#endif +#include +#include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#include +#if !defined(_LIBCPP_HAS_NO_THREADS) +# include +#endif +#include +#include +#include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#include +#include +#include +#include +#include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#include +#if !defined(_LIBCPP_HAS_NO_THREADS) +# include +#endif +#include +#include +#if !defined(_LIBCPP_HAS_NO_THREADS) +# include +#endif +#include +#include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#include +#if !defined(_LIBCPP_HAS_NO_ATOMIC_HEADER) +# include +#endif +#include +#include +#include +#include +#include +#include +#if !defined(_LIBCPP_HAS_NO_THREADS) +# include +#endif +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#include +#include +#include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#include +#include +#if !defined(_LIBCPP_HAS_NO_THREADS) +# include +#endif +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if !defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS) +# include +#endif +#if !defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS) +# include +#endif diff --git a/libcxx/include/__type_traits/is_nothrow_constructible.h b/libcxx/include/__type_traits/is_nothrow_constructible.h index d4686d89fd96..4949062433b7 100644 --- a/libcxx/include/__type_traits/is_nothrow_constructible.h +++ b/libcxx/include/__type_traits/is_nothrow_constructible.h @@ -22,7 +22,8 @@ _LIBCPP_BEGIN_NAMESPACE_STD -#if __has_builtin(__is_nothrow_constructible) +// GCC is disabled due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106611 +#if __has_builtin(__is_nothrow_constructible) && !defined(_LIBCPP_COMPILER_GCC) template < class _Tp, class... _Args> struct _LIBCPP_TEMPLATE_VIS is_nothrow_constructible diff --git a/libcxx/include/__type_traits/remove_cv.h b/libcxx/include/__type_traits/remove_cv.h index 8fe8fb0e4959..c4bf612794bd 100644 --- a/libcxx/include/__type_traits/remove_cv.h +++ b/libcxx/include/__type_traits/remove_cv.h @@ -19,7 +19,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD -#if __has_builtin(__remove_cv) +#if __has_builtin(__remove_cv) && !defined(_LIBCPP_COMPILER_GCC) template struct remove_cv { using type _LIBCPP_NODEBUG = __remove_cv(_Tp); diff --git a/libcxx/include/__type_traits/remove_cvref.h b/libcxx/include/__type_traits/remove_cvref.h index 4dc950ac31ad..e8e8745ab096 100644 --- a/libcxx/include/__type_traits/remove_cvref.h +++ b/libcxx/include/__type_traits/remove_cvref.h @@ -20,7 +20,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD -#if __has_builtin(__remove_cvref) +#if __has_builtin(__remove_cvref) && !defined(_LIBCPP_COMPILER_GCC) template using __remove_cvref_t _LIBCPP_NODEBUG = __remove_cvref(_Tp); #else diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in index 0b418d2b7897..37a9edcd7ece 100644 --- a/libcxx/include/module.modulemap.in +++ b/libcxx/include/module.modulemap.in @@ -16,7 +16,6 @@ module std_atomic [system] { export * } module std_barrier [system] { - @requires_LIBCXX_ENABLE_THREADS@ header "barrier" export * } @@ -37,7 +36,6 @@ module std_chrono [system] { export * } module std_codecvt [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "codecvt" export * } @@ -78,7 +76,6 @@ module std_expected [system] { export * } module std_filesystem [system] { - @requires_LIBCXX_ENABLE_FILESYSTEM@ header "filesystem" export * } @@ -91,8 +88,6 @@ module std_forward_list [system] { export * } module std_fstream [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ - @requires_LIBCXX_ENABLE_FILESYSTEM@ header "fstream" export * } @@ -101,7 +96,6 @@ module std_functional [system] { export * } module std_future [system] { - @requires_LIBCXX_ENABLE_THREADS@ header "future" export * } @@ -110,12 +104,10 @@ module std_initializer_list [system] { export * } module std_iomanip [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "iomanip" export * } module std_ios [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "ios" export * } @@ -124,12 +116,10 @@ module std_iosfwd [system] { export * } module std_iostream [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "iostream" export * } module std_istream [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "istream" export * } @@ -138,7 +128,6 @@ module std_iterator [system] { export * } module std_latch [system] { - @requires_LIBCXX_ENABLE_THREADS@ header "latch" export * } @@ -151,7 +140,6 @@ module std_list [system] { export * } module 
std_locale [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "locale" export * } @@ -192,7 +180,6 @@ module std_optional [system] { export * } module std_ostream [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "ostream" export * } @@ -217,7 +204,6 @@ module std_ratio [system] { export * } module std_regex [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "regex" export * } @@ -226,7 +212,6 @@ module std_scoped_allocator [system] { export * } module std_semaphore [system] { - @requires_LIBCXX_ENABLE_THREADS@ header "semaphore" export * } @@ -235,7 +220,6 @@ module std_set [system] { export * } module std_shared_mutex [system] { - @requires_LIBCXX_ENABLE_THREADS@ header "shared_mutex" export std_version } @@ -250,7 +234,6 @@ module std_span [system] { export std_private_span_span_fwd } module std_sstream [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "sstream" export * } @@ -263,12 +246,10 @@ module std_stdexcept [system] { export * } module std_stop_token { - @requires_LIBCXX_ENABLE_THREADS@ header "stop_token" export * } module std_streambuf [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "streambuf" export * } @@ -281,7 +262,6 @@ module std_string_view [system] { export * } module std_strstream [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "strstream" export * } @@ -290,7 +270,6 @@ module std_system_error [system] { export * } module std_thread [system] { - @requires_LIBCXX_ENABLE_THREADS@ header "thread" export * } @@ -377,7 +356,6 @@ module std_climits [system] { export * } module std_clocale [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "clocale" export * } @@ -435,12 +413,10 @@ module std_cuchar [system] { export * } module std_cwchar [system] { - @requires_LIBCXX_ENABLE_WIDE_CHARACTERS@ header "cwchar" export * } module std_cwctype [system] { - @requires_LIBCXX_ENABLE_WIDE_CHARACTERS@ header "cwctype" export * } @@ -477,7 +453,6 @@ module std_limits_h [system] { export * } module std_locale_h [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "locale.h" export * } @@ -493,8 +468,6 @@ module std_setjmp_h [system] { // FIXME: is missing. // provided by compiler. module std_stdatomic_h [system] { - @requires_LIBCXX_ENABLE_THREADS@ - requires cplusplus23 header "stdatomic.h" export * } @@ -536,21 +509,17 @@ module std_uchar_h [system] { } // provided by C library. module std_wchar_h [system] { - @requires_LIBCXX_ENABLE_WIDE_CHARACTERS@ // 's __need_* macros require textual inclusion. textual header "wchar.h" export * } module std_wctype_h [system] { - @requires_LIBCXX_ENABLE_WIDE_CHARACTERS@ header "wctype.h" export * } // Experimental C++ standard library interfaces module std_experimental [system] { - requires cplusplus11 - module deque { header "experimental/deque" export * @@ -627,6 +596,13 @@ module std_experimental [system] { } } +// Convenience method to get all of the above modules in a single import statement. +// Importing only the needed modules is likely to be more performant. +module std [system] { + header "__std_clang_module" + export * +} + // Implementation detail headers that are private to libc++. These modules // must not be directly imported. 
module std_private_assert [system] { @@ -650,7 +626,6 @@ module std_private_hash_table [system] { export * } module std_private_locale [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "__locale" export * } @@ -819,6 +794,7 @@ module std_private_algorithm_pstl_generate [system module std_private_algorithm_pstl_is_partitioned [system] { header "__algorithm/pstl_is_partitioned.h" } module std_private_algorithm_pstl_merge [system] { header "__algorithm/pstl_merge.h" } module std_private_algorithm_pstl_replace [system] { header "__algorithm/pstl_replace.h" } +module std_private_algorithm_pstl_sort [system] { header "__algorithm/pstl_sort.h" } module std_private_algorithm_pstl_stable_sort [system] { header "__algorithm/pstl_stable_sort.h" export std_private_functional_operations @@ -1188,7 +1164,6 @@ module std_private_chrono_duration [system] { } module std_private_chrono_file_clock [system] { header "__chrono/file_clock.h" } module std_private_chrono_formatter [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "__chrono/formatter.h" } module std_private_chrono_hh_mm_ss [system] { header "__chrono/hh_mm_ss.h" } @@ -1202,11 +1177,9 @@ module std_private_chrono_month [system] { header "__chrono/mon module std_private_chrono_month_weekday [system] { header "__chrono/month_weekday.h" } module std_private_chrono_monthday [system] { header "__chrono/monthday.h" } module std_private_chrono_ostream [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "__chrono/ostream.h" } module std_private_chrono_parser_std_format_spec [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "__chrono/parser_std_format_spec.h" } module std_private_chrono_statically_widen [system] { header "__chrono/statically_widen.h" } @@ -1530,7 +1503,10 @@ module std_private_memory_allocator_destructor [system] { header "__m module std_private_memory_allocator_traits [system] { header "__memory/allocator_traits.h" } module std_private_memory_assume_aligned [system] { header "__memory/assume_aligned.h" } module std_private_memory_auto_ptr [system] { header "__memory/auto_ptr.h" } -module std_private_memory_builtin_new_allocator [system] { header "__memory/builtin_new_allocator.h" } +module std_private_memory_builtin_new_allocator [system] { + header "__memory/builtin_new_allocator.h" + export * +} module std_private_memory_compressed_pair [system] { header "__memory/compressed_pair.h" } module std_private_memory_concepts [system] { header "__memory/concepts.h" @@ -1637,9 +1613,15 @@ module std_private_random_piecewise_linear_distribution [system] { export * } module std_private_random_poisson_distribution [system] { header "__random/poisson_distribution.h" } -module std_private_random_random_device [system] { header "__random/random_device.h" } +module std_private_random_random_device [system] { + header "__random/random_device.h" + export * +} module std_private_random_ranlux [system] { header "__random/ranlux.h" } -module std_private_random_seed_seq [system] { header "__random/seed_seq.h" } +module std_private_random_seed_seq [system] { + header "__random/seed_seq.h" + export * +} module std_private_random_shuffle_order_engine [system] { header "__random/shuffle_order_engine.h" } module std_private_random_student_t_distribution [system] { header "__random/student_t_distribution.h" } module std_private_random_subtract_with_carry_engine [system] { header "__random/subtract_with_carry_engine.h" } @@ -1682,7 +1664,6 @@ module std_private_ranges_filter_view [system] { module std_private_ranges_from_range [system] 
{ header "__ranges/from_range.h" } module std_private_ranges_iota_view [system] { header "__ranges/iota_view.h" } module std_private_ranges_istream_view [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "__ranges/istream_view.h" } module std_private_ranges_join_view [system] { @@ -1749,7 +1730,10 @@ module std_private_stop_token_stop_token [system] { export * } -module std_private_string_char_traits [system] { header "__string/char_traits.h" } +module std_private_string_char_traits [system] { + header "__string/char_traits.h" + export * +} module std_private_string_constexpr_c_functions [system] { header "__string/constexpr_c_functions.h" export std_private_type_traits_is_equality_comparable diff --git a/libcxx/include/sstream b/libcxx/include/sstream index d7ad0213eb34..40930df24c6d 100644 --- a/libcxx/include/sstream +++ b/libcxx/include/sstream @@ -399,8 +399,12 @@ public: _LIBCPP_HIDE_FROM_ABI_SSTREAM string_type str() const & { return str(__str_.get_allocator()); } _LIBCPP_HIDE_FROM_ABI_SSTREAM string_type str() && { + string_type __result; const basic_string_view<_CharT, _Traits> __view = view(); - string_type __result(std::move(__str_), __view.data() - __str_.data(), __view.size()); + if (!__view.empty()) { + auto __pos = __view.data() - __str_.data(); + __result.assign(std::move(__str_), __pos, __view.size()); + } __str_.clear(); __init_buf_ptrs(); return __result; @@ -415,7 +419,7 @@ public: } _LIBCPP_HIDE_FROM_ABI basic_string_view view() const noexcept; -#endif +#endif // _LIBCPP_STD_VER >= 20 void str(const string_type& __s) { __str_ = __s; @@ -900,20 +904,22 @@ public: return const_cast*>(&__sb_); } -#if _LIBCPP_STD_VER >= 20 - _LIBCPP_HIDE_FROM_ABI string_type str() const & { return __sb_.str(); } +#if _LIBCPP_STD_VER <= 17 || defined(_LIBCPP_BUILDING_LIBRARY) + _LIBCPP_HIDE_FROM_ABI string_type str() const { return __sb_.str(); } +#else + _LIBCPP_HIDE_FROM_ABI_SSTREAM string_type str() const & { return __sb_.str(); } + _LIBCPP_HIDE_FROM_ABI_SSTREAM string_type str() && { return std::move(__sb_).str(); } +#endif + +#if _LIBCPP_STD_VER >= 20 template requires __is_allocator<_SAlloc>::value _LIBCPP_HIDE_FROM_ABI basic_string str(const _SAlloc& __sa) const { return __sb_.str(__sa); } - _LIBCPP_HIDE_FROM_ABI string_type str() && { return std::move(__sb_).str(); } - _LIBCPP_HIDE_FROM_ABI basic_string_view view() const noexcept { return __sb_.view(); } -#else // _LIBCPP_STD_VER >= 20 - _LIBCPP_HIDE_FROM_ABI string_type str() const { return __sb_.str(); } #endif // _LIBCPP_STD_VER >= 20 _LIBCPP_HIDE_FROM_ABI void str(const string_type& __s) { __sb_.str(__s); } @@ -1023,20 +1029,22 @@ public: return const_cast*>(&__sb_); } -#if _LIBCPP_STD_VER >= 20 - _LIBCPP_HIDE_FROM_ABI string_type str() const & { return __sb_.str(); } +#if _LIBCPP_STD_VER <= 17 || defined(_LIBCPP_BUILDING_LIBRARY) + _LIBCPP_HIDE_FROM_ABI string_type str() const { return __sb_.str(); } +#else + _LIBCPP_HIDE_FROM_ABI_SSTREAM string_type str() const & { return __sb_.str(); } + + _LIBCPP_HIDE_FROM_ABI_SSTREAM string_type str() && { return std::move(__sb_).str(); } +#endif +#if _LIBCPP_STD_VER >= 20 template requires __is_allocator<_SAlloc>::value _LIBCPP_HIDE_FROM_ABI basic_string str(const _SAlloc& __sa) const { return __sb_.str(__sa); } - _LIBCPP_HIDE_FROM_ABI string_type str() && { return std::move(__sb_).str(); } - _LIBCPP_HIDE_FROM_ABI basic_string_view view() const noexcept { return __sb_.view(); } -#else // _LIBCPP_STD_VER >= 20 - _LIBCPP_HIDE_FROM_ABI string_type str() const { return __sb_.str(); } 
#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_HIDE_FROM_ABI void str(const string_type& __s) { __sb_.str(__s); } @@ -1145,20 +1153,22 @@ public: return const_cast*>(&__sb_); } -#if _LIBCPP_STD_VER >= 20 - _LIBCPP_HIDE_FROM_ABI string_type str() const & { return __sb_.str(); } +#if _LIBCPP_STD_VER <= 17 || defined(_LIBCPP_BUILDING_LIBRARY) + _LIBCPP_HIDE_FROM_ABI string_type str() const { return __sb_.str(); } +#else + _LIBCPP_HIDE_FROM_ABI_SSTREAM string_type str() const & { return __sb_.str(); } + + _LIBCPP_HIDE_FROM_ABI_SSTREAM string_type str() && { return std::move(__sb_).str(); } +#endif +#if _LIBCPP_STD_VER >= 20 template requires __is_allocator<_SAlloc>::value _LIBCPP_HIDE_FROM_ABI basic_string str(const _SAlloc& __sa) const { return __sb_.str(__sa); } - _LIBCPP_HIDE_FROM_ABI string_type str() && { return std::move(__sb_).str(); } - _LIBCPP_HIDE_FROM_ABI basic_string_view view() const noexcept { return __sb_.view(); } -#else // _LIBCPP_STD_VER >= 20 - _LIBCPP_HIDE_FROM_ABI string_type str() const { return __sb_.str(); } #endif // _LIBCPP_STD_VER >= 20 _LIBCPP_HIDE_FROM_ABI void str(const string_type& __s) { __sb_.str(__s); } diff --git a/libcxx/modules/std/atomic.cppm b/libcxx/modules/std/atomic.cppm index 9c1948494bd4..faf902b768d7 100644 --- a/libcxx/modules/std/atomic.cppm +++ b/libcxx/modules/std/atomic.cppm @@ -23,9 +23,6 @@ export namespace std { using std::memory_order_seq_cst; using std::kill_dependency; -} // namespace std - -namespace std { // [atomics.ref.generic], class template atomic_ref // [atomics.ref.pointer], partial specialization for pointers diff --git a/libcxx/modules/std/execution.cppm b/libcxx/modules/std/execution.cppm index 6ea12c0dc4ee..e0996f33d415 100644 --- a/libcxx/modules/std/execution.cppm +++ b/libcxx/modules/std/execution.cppm @@ -17,7 +17,7 @@ export namespace std { using std::is_execution_policy_v; } // namespace std -namespace std::execution { +export namespace std::execution { // [execpol.seq], sequenced execution policy using std::execution::sequenced_policy; diff --git a/libcxx/modules/std/filesystem.cppm b/libcxx/modules/std/filesystem.cppm index ea8136b4ef9f..c6dac368a1cd 100644 --- a/libcxx/modules/std/filesystem.cppm +++ b/libcxx/modules/std/filesystem.cppm @@ -110,11 +110,11 @@ export namespace std::filesystem { } // namespace std::filesystem // [fs.path.hash], hash support -namespace std { +export namespace std { using std::hash; } -namespace std::ranges { +export namespace std::ranges { using std::ranges::enable_borrowed_range; using std::ranges::enable_view; } // namespace std::ranges diff --git a/libcxx/src/chrono.cpp b/libcxx/src/chrono.cpp index 0990d8dc181c..f1596132024c 100644 --- a/libcxx/src/chrono.cpp +++ b/libcxx/src/chrono.cpp @@ -31,7 +31,7 @@ # include // for gettimeofday and timeval #endif -#if defined(__APPLE__) || (defined(_POSIX_TIMERS) && _POSIX_TIMERS > 0) +#if defined(__APPLE__) || defined (__gnu_hurd__) || (defined(_POSIX_TIMERS) && _POSIX_TIMERS > 0) # define _LIBCPP_HAS_CLOCK_GETTIME #endif diff --git a/libcxx/src/filesystem/filesystem_clock.cpp b/libcxx/src/filesystem/filesystem_clock.cpp index d00cdc6df343..fbb19ac68df5 100644 --- a/libcxx/src/filesystem/filesystem_clock.cpp +++ b/libcxx/src/filesystem/filesystem_clock.cpp @@ -29,7 +29,7 @@ # include // for gettimeofday and timeval #endif -#if defined(__APPLE__) || (defined(_POSIX_TIMERS) && _POSIX_TIMERS > 0) +#if defined(__APPLE__) || defined (__gnu_hurd__) || (defined(_POSIX_TIMERS) && _POSIX_TIMERS > 0) # define _LIBCPP_HAS_CLOCK_GETTIME #endif diff --git 
a/libunwind/src/Unwind-EHABI.cpp b/libunwind/src/Unwind-EHABI.cpp index f387c5d3db4e..05475c6ac1e2 100644 --- a/libunwind/src/Unwind-EHABI.cpp +++ b/libunwind/src/Unwind-EHABI.cpp @@ -885,8 +885,11 @@ _Unwind_GetLanguageSpecificData(struct _Unwind_Context *context) { return result; } -static uint64_t ValueAsBitPattern(_Unwind_VRS_DataRepresentation representation, - void* valuep) { +// Only used in _LIBUNWIND_TRACE_API, which is a no-op when assertions are +// disabled. +[[gnu::unused]] static uint64_t +ValueAsBitPattern(_Unwind_VRS_DataRepresentation representation, + const void *valuep) { uint64_t value = 0; switch (representation) { case _UVRSD_UINT32: diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp index 9dc99e573d41..04ddb4682917 100644 --- a/lld/ELF/Arch/LoongArch.cpp +++ b/lld/ELF/Arch/LoongArch.cpp @@ -457,6 +457,7 @@ RelExpr LoongArch::getRelExpr(const RelType type, const Symbol &s, return R_RISCV_ADD; case R_LARCH_32_PCREL: case R_LARCH_64_PCREL: + case R_LARCH_PCREL20_S2: return R_PC; case R_LARCH_B16: case R_LARCH_B21: @@ -564,6 +565,12 @@ void LoongArch::relocate(uint8_t *loc, const Relocation &rel, write64le(loc, val); return; + case R_LARCH_PCREL20_S2: + checkInt(loc, val, 22, rel); + checkAlignment(loc, val, 4, rel); + write32le(loc, setJ20(read32le(loc), val >> 2)); + return; + case R_LARCH_B16: checkInt(loc, val, 18, rel); checkAlignment(loc, val, 4, rel); diff --git a/lld/ELF/Arch/PPC.cpp b/lld/ELF/Arch/PPC.cpp index 87942c1e9245..3d21edb3453a 100644 --- a/lld/ELF/Arch/PPC.cpp +++ b/lld/ELF/Arch/PPC.cpp @@ -471,10 +471,14 @@ void PPC::relaxTlsIeToLe(uint8_t *loc, const Relocation &rel, if (insn >> 26 != 31) error("unrecognized instruction for IE to LE R_PPC_TLS"); // addi rT, rT, x@tls --> addi rT, rT, x@tprel@l - uint32_t dFormOp = getPPCDFormOp((read32(loc) & 0x000007fe) >> 1); - if (dFormOp == 0) - error("unrecognized instruction for IE to LE R_PPC_TLS"); - write32(loc, (dFormOp << 26) | (insn & 0x03ff0000) | lo(val)); + unsigned secondaryOp = (read32(loc) & 0x000007fe) >> 1; + uint32_t dFormOp = getPPCDFormOp(secondaryOp); + if (dFormOp == 0) { // Expecting a DS-Form instruction. + dFormOp = getPPCDSFormOp(secondaryOp); + if (dFormOp == 0) + error("unrecognized instruction for IE to LE R_PPC_TLS"); + } + write32(loc, (dFormOp | (insn & 0x03ff0000) | lo(val))); break; } default: diff --git a/lld/ELF/Arch/PPC64.cpp b/lld/ELF/Arch/PPC64.cpp index 36b1d0e3c9be..0b6459f852c0 100644 --- a/lld/ELF/Arch/PPC64.cpp +++ b/lld/ELF/Arch/PPC64.cpp @@ -37,6 +37,12 @@ enum XFormOpcd { STHX = 407, STWX = 151, STDX = 149, + LHAX = 343, + LWAX = 341, + LFSX = 535, + LFDX = 599, + STFSX = 663, + STFDX = 727, ADD = 266, }; @@ -49,7 +55,6 @@ enum DFormOpcd { LWZ = 32, LWZU = 33, LFSU = 49, - LD = 58, LFDU = 51, STB = 38, STBU = 39, @@ -59,10 +64,20 @@ enum DFormOpcd { STWU = 37, STFSU = 53, STFDU = 55, - STD = 62, + LHA = 42, + LFS = 48, + LFD = 50, + STFS = 52, + STFD = 54, ADDI = 14 }; +enum DSFormOpcd { + LD = 58, + LWA = 58, + STD = 62 +}; + constexpr uint32_t NOP = 0x60000000; enum class PPCLegacyInsn : uint32_t { @@ -825,26 +840,48 @@ void PPC64::relaxTlsLdToLe(uint8_t *loc, const Relocation &rel, } } +// Map X-Form instructions to their DS-Form counterparts, if applicable. +// The full encoding is returned here to distinguish between the different +// DS-Form instructions. 
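// Illustrative sketch, not from the upstream change: a PowerPC DS-form
// instruction carries its primary opcode in the top six bits of the word and a
// two-bit extended opcode (XO) in the lowest two bits, which is why ld and lwa
// can share primary opcode 58 and differ only in XO (0 versus 2). Returning the
// fully shifted encoding, as described above, preserves that distinction:
static inline uint32_t encodeDSFormOpcode(uint32_t primaryOp, uint32_t xo) {
  return (primaryOp << 26) | (xo & 0x3);  // register and offset fields are ORed in later
}
// For example: encodeDSFormOpcode(58, 0) selects ld, encodeDSFormOpcode(58, 2)
// selects lwa, and encodeDSFormOpcode(62, 0) selects std.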
+unsigned elf::getPPCDSFormOp(unsigned secondaryOp) { + switch (secondaryOp) { + case LWAX: + return (LWA << 26) | 0x2; + case LDX: + return LD << 26; + case STDX: + return STD << 26; + default: + return 0; + } +} + unsigned elf::getPPCDFormOp(unsigned secondaryOp) { switch (secondaryOp) { case LBZX: - return LBZ; + return LBZ << 26; case LHZX: - return LHZ; + return LHZ << 26; case LWZX: - return LWZ; - case LDX: - return LD; + return LWZ << 26; case STBX: - return STB; + return STB << 26; case STHX: - return STH; + return STH << 26; case STWX: - return STW; - case STDX: - return STD; + return STW << 26; + case LHAX: + return LHA << 26; + case LFSX: + return LFS << 26; + case LFDX: + return LFD << 26; + case STFSX: + return STFS << 26; + case STFDX: + return STFD << 26; case ADD: - return ADDI; + return ADDI << 26; default: return 0; } @@ -898,10 +935,16 @@ void PPC64::relaxTlsIeToLe(uint8_t *loc, const Relocation &rel, error("unrecognized instruction for IE to LE R_PPC64_TLS"); uint32_t secondaryOp = (read32(loc) & 0x000007FE) >> 1; // bits 21-30 uint32_t dFormOp = getPPCDFormOp(secondaryOp); - if (dFormOp == 0) - error("unrecognized instruction for IE to LE R_PPC64_TLS"); - write32(loc, ((dFormOp << 26) | (read32(loc) & 0x03FFFFFF))); - relocateNoSym(loc + offset, R_PPC64_TPREL16_LO, val); + uint32_t finalReloc; + if (dFormOp == 0) { // Expecting a DS-Form instruction. + dFormOp = getPPCDSFormOp(secondaryOp); + if (dFormOp == 0) + error("unrecognized instruction for IE to LE R_PPC64_TLS"); + finalReloc = R_PPC64_TPREL16_LO_DS; + } else + finalReloc = R_PPC64_TPREL16_LO; + write32(loc, dFormOp | (read32(loc) & 0x03ff0000)); + relocateNoSym(loc + offset, finalReloc, val); } else if (locAsInt % 4 == 1) { // If the offset is not 4 byte aligned then we have a PCRel type reloc. // This version of the relocation is offset by one byte from the @@ -926,9 +969,12 @@ void PPC64::relaxTlsIeToLe(uint8_t *loc, const Relocation &rel, } } else { uint32_t dFormOp = getPPCDFormOp(secondaryOp); - if (dFormOp == 0) - errorOrWarn("unrecognized instruction for IE to LE R_PPC64_TLS"); - write32(loc - 1, ((dFormOp << 26) | (tlsInstr & 0x03FF0000))); + if (dFormOp == 0) { // Expecting a DS-Form instruction. + dFormOp = getPPCDSFormOp(secondaryOp); + if (dFormOp == 0) + errorOrWarn("unrecognized instruction for IE to LE R_PPC64_TLS"); + } + write32(loc - 1, (dFormOp | (tlsInstr & 0x03ff0000))); } } else { errorOrWarn("R_PPC64_TLS must be either 4 byte aligned or one byte " diff --git a/lld/ELF/Target.h b/lld/ELF/Target.h index 9d4f22dd93f1..47dbe6b4d1c6 100644 --- a/lld/ELF/Target.h +++ b/lld/ELF/Target.h @@ -207,6 +207,7 @@ void processArmCmseSymbols(); void writePPC32GlinkSection(uint8_t *buf, size_t numEntries); unsigned getPPCDFormOp(unsigned secondaryOp); +unsigned getPPCDSFormOp(unsigned secondaryOp); // In the PowerPC64 Elf V2 abi a function can have 2 entry points. The first // is a global entry point (GEP) which typically is used to initialize the TOC diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst index d8e34d1e6c74..7e2e07d5f11b 100644 --- a/lld/docs/ReleaseNotes.rst +++ b/lld/docs/ReleaseNotes.rst @@ -107,6 +107,11 @@ COFF Improvements MinGW Improvements ------------------ +* A warning is now printed if the linked module contains runtime pseudo + relocations that are too narrow, that can end up producing runtime + errors if loaded too far away from the referenced DLL in the address + space. 
(`D154777 `_) + MachO Improvements ------------------ diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/GNUstepObjCRuntime/GNUstepObjCRuntime.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/GNUstepObjCRuntime/GNUstepObjCRuntime.cpp index fb2656ef1385..39b3e816f4be 100644 --- a/lldb/source/Plugins/LanguageRuntime/ObjC/GNUstepObjCRuntime/GNUstepObjCRuntime.cpp +++ b/lldb/source/Plugins/LanguageRuntime/ObjC/GNUstepObjCRuntime/GNUstepObjCRuntime.cpp @@ -37,6 +37,33 @@ void GNUstepObjCRuntime::Terminate() { PluginManager::UnregisterPlugin(CreateInstance); } +static bool CanModuleBeGNUstepObjCLibrary(const ModuleSP &module_sp, + const llvm::Triple &TT) { + if (!module_sp) + return false; + const FileSpec &module_file_spec = module_sp->GetFileSpec(); + if (!module_file_spec) + return false; + llvm::StringRef filename = module_file_spec.GetFilename().GetStringRef(); + if (TT.isOSBinFormatELF()) + return filename.starts_with("libobjc.so"); + if (TT.isOSWindows()) + return filename == "objc.dll"; + return false; +} + +static bool ScanForGNUstepObjCLibraryCandidate(const ModuleList &modules, + const llvm::Triple &TT) { + std::lock_guard guard(modules.GetMutex()); + size_t num_modules = modules.GetSize(); + for (size_t i = 0; i < num_modules; i++) { + auto mod = modules.GetModuleAtIndex(i); + if (CanModuleBeGNUstepObjCLibrary(mod, TT)) + return true; + } + return false; +} + LanguageRuntime *GNUstepObjCRuntime::CreateInstance(Process *process, LanguageType language) { if (language != eLanguageTypeObjC) @@ -50,6 +77,9 @@ LanguageRuntime *GNUstepObjCRuntime::CreateInstance(Process *process, return nullptr; const ModuleList &images = target.GetImages(); + if (!ScanForGNUstepObjCLibraryCandidate(images, TT)) + return nullptr; + if (TT.isOSBinFormatELF()) { SymbolContextList eh_pers; RegularExpression regex("__gnustep_objc[x]*_personality_v[0-9]+"); @@ -176,18 +206,8 @@ void GNUstepObjCRuntime::UpdateISAToDescriptorMapIfNeeded() { } bool GNUstepObjCRuntime::IsModuleObjCLibrary(const ModuleSP &module_sp) { - if (!module_sp) - return false; - const FileSpec &module_file_spec = module_sp->GetFileSpec(); - if (!module_file_spec) - return false; - llvm::StringRef filename = module_file_spec.GetFilename().GetStringRef(); const llvm::Triple &TT = GetTargetRef().GetArchitecture().GetTriple(); - if (TT.isOSBinFormatELF()) - return filename.starts_with("libobjc.so"); - if (TT.isOSWindows()) - return filename == "objc.dll"; - return false; + return CanModuleBeGNUstepObjCLibrary(module_sp, TT); } bool GNUstepObjCRuntime::ReadObjCLibrary(const ModuleSP &module_sp) { diff --git a/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_arm64.cpp b/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_arm64.cpp index 676e450c4846..d306c818e89f 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_arm64.cpp +++ b/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_arm64.cpp @@ -47,6 +47,10 @@ bool RegisterContextPOSIX_arm64::IsPAuth(unsigned reg) const { return m_register_info_up->IsPAuthReg(reg); } +bool RegisterContextPOSIX_arm64::IsTLS(unsigned reg) const { + return m_register_info_up->IsTLSReg(reg); +} + RegisterContextPOSIX_arm64::RegisterContextPOSIX_arm64( lldb_private::Thread &thread, std::unique_ptr register_info) diff --git a/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_arm64.h b/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_arm64.h index 7c301599d3af..6a935274fc40 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_arm64.h +++ 
b/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_arm64.h @@ -55,6 +55,7 @@ class RegisterContextPOSIX_arm64 : public lldb_private::RegisterContext { bool IsSVE(unsigned reg) const; bool IsPAuth(unsigned reg) const; + bool IsTLS(unsigned reg) const; bool IsSVEZ(unsigned reg) const { return m_register_info_up->IsSVEZReg(reg); } bool IsSVEP(unsigned reg) const { return m_register_info_up->IsSVEPReg(reg); } diff --git a/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.h b/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.h index 465d3f5b9f3b..20cfe732c6c2 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.h +++ b/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.h @@ -117,6 +117,7 @@ class RegisterInfoPOSIX_arm64 bool IsSVEEnabled() const { return m_opt_regsets.AnySet(eRegsetMaskSVE); } bool IsPAuthEnabled() const { return m_opt_regsets.AnySet(eRegsetMaskPAuth); } bool IsMTEEnabled() const { return m_opt_regsets.AnySet(eRegsetMaskMTE); } + bool IsTLSEnabled() const { return m_opt_regsets.AnySet(eRegsetMaskTLS); } bool IsSVEReg(unsigned reg) const; bool IsSVEZReg(unsigned reg) const; diff --git a/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_arm64.cpp b/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_arm64.cpp index 22a9996b1a6e..38abd8f8f2b1 100644 --- a/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_arm64.cpp +++ b/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_arm64.cpp @@ -34,6 +34,12 @@ RegisterContextCorePOSIX_arm64::Create(Thread &thread, const ArchSpec &arch, if (pac_data.GetByteSize() >= sizeof(uint64_t) * 2) opt_regsets.Set(RegisterInfoPOSIX_arm64::eRegsetMaskPAuth); + DataExtractor tls_data = getRegset(notes, arch.GetTriple(), AARCH64_TLS_Desc); + // A valid note will always contain at least one register, "tpidr". It may + // expand in future. 
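The comment above documents that the NT_ARM_TLS note carries at least one 64-bit register, tpidr; once the regset has been attached, the value can be read back with that same layout. A minimal illustrative sketch follows (the helper name is hypothetical and not part of the patch):

    #include "lldb/Utility/DataExtractor.h"
    #include <cstdint>

    // tpidr is the first (and currently the only) register carried by the
    // NT_ARM_TLS core-file note.
    static uint64_t ReadTpidr(const lldb_private::DataExtractor &tls_data) {
      lldb::offset_t offset = 0;
      return tls_data.GetU64(&offset);
    }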
+ if (tls_data.GetByteSize() >= sizeof(uint64_t)) + opt_regsets.Set(RegisterInfoPOSIX_arm64::eRegsetMaskTLS); + auto register_info_up = std::make_unique(arch, opt_regsets); return std::unique_ptr( @@ -59,6 +65,9 @@ RegisterContextCorePOSIX_arm64::RegisterContextCorePOSIX_arm64( if (m_register_info_up->IsPAuthEnabled()) m_pac_data = getRegset(notes, target_triple, AARCH64_PAC_Desc); + if (m_register_info_up->IsTLSEnabled()) + m_tls_data = getRegset(notes, target_triple, AARCH64_TLS_Desc); + ConfigureRegisterContext(); } @@ -223,6 +232,11 @@ bool RegisterContextCorePOSIX_arm64::ReadRegister(const RegisterInfo *reg_info, assert(offset < m_pac_data.GetByteSize()); value.SetFromMemoryData(*reg_info, m_pac_data.GetDataStart() + offset, reg_info->byte_size, lldb::eByteOrderLittle, error); + } else if (IsTLS(reg)) { + offset = reg_info->byte_offset - m_register_info_up->GetTLSOffset(); + assert(offset < m_tls_data.GetByteSize()); + value.SetFromMemoryData(*reg_info, m_tls_data.GetDataStart() + offset, + reg_info->byte_size, lldb::eByteOrderLittle, error); } else return false; diff --git a/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_arm64.h b/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_arm64.h index f8548562adba..5e0e29f0de7f 100644 --- a/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_arm64.h +++ b/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_arm64.h @@ -57,6 +57,7 @@ class RegisterContextCorePOSIX_arm64 : public RegisterContextPOSIX_arm64 { lldb_private::DataExtractor m_fpr_data; lldb_private::DataExtractor m_sve_data; lldb_private::DataExtractor m_pac_data; + lldb_private::DataExtractor m_tls_data; SVEState m_sve_state; uint16_t m_sve_vector_length = 0; diff --git a/lldb/source/Plugins/Process/elf-core/RegisterUtilities.h b/lldb/source/Plugins/Process/elf-core/RegisterUtilities.h index f6a2fbdcc938..3d53a5795ef3 100644 --- a/lldb/source/Plugins/Process/elf-core/RegisterUtilities.h +++ b/lldb/source/Plugins/Process/elf-core/RegisterUtilities.h @@ -123,6 +123,10 @@ constexpr RegsetDesc AARCH64_PAC_Desc[] = { {llvm::Triple::Linux, llvm::Triple::aarch64, llvm::ELF::NT_ARM_PAC_MASK}, }; +constexpr RegsetDesc AARCH64_TLS_Desc[] = { + {llvm::Triple::Linux, llvm::Triple::aarch64, llvm::ELF::NT_ARM_TLS}, +}; + constexpr RegsetDesc PPC_VMX_Desc[] = { {llvm::Triple::FreeBSD, llvm::Triple::UnknownArch, llvm::ELF::NT_PPC_VMX}, {llvm::Triple::Linux, llvm::Triple::UnknownArch, llvm::ELF::NT_PPC_VMX}, diff --git a/llvm/include/llvm/ADT/FunctionExtras.h b/llvm/include/llvm/ADT/FunctionExtras.h index 53de2cb74253..4cf1de488c7b 100644 --- a/llvm/include/llvm/ADT/FunctionExtras.h +++ b/llvm/include/llvm/ADT/FunctionExtras.h @@ -59,7 +59,7 @@ namespace detail { template using EnableIfTrivial = - std::enable_if_t::value && + std::enable_if_t::value && std::is_trivially_destructible::value>; template using EnableUnlessSameType = @@ -99,11 +99,11 @@ template class UniqueFunctionBase { template struct AdjustedParamTBase { static_assert(!std::is_reference::value, "references should be handled by template specialization"); - using type = std::conditional_t< - llvm::is_trivially_copy_constructible::value && - llvm::is_trivially_move_constructible::value && - IsSizeLessThanThresholdT::value, - T, T &>; + using type = + std::conditional_t::value && + std::is_trivially_move_constructible::value && + IsSizeLessThanThresholdT::value, + T, T &>; }; // This specialization ensures that 'AdjustedParam&>' or diff --git a/llvm/include/llvm/ADT/SmallVector.h 
b/llvm/include/llvm/ADT/SmallVector.h index 93d94916745d..53a107b1574c 100644 --- a/llvm/include/llvm/ADT/SmallVector.h +++ b/llvm/include/llvm/ADT/SmallVector.h @@ -326,8 +326,8 @@ class SmallVectorTemplateCommon /// copy these types with memcpy, there is no way for the type to observe this. /// This catches the important case of std::pair, which is not /// trivially assignable. -template ::value) && - (is_trivially_move_constructible::value) && +template ::value) && + (std::is_trivially_move_constructible::value) && std::is_trivially_destructible::value> class SmallVectorTemplateBase : public SmallVectorTemplateCommon { friend class SmallVectorTemplateCommon; diff --git a/llvm/include/llvm/Analysis/RegionInfoImpl.h b/llvm/include/llvm/Analysis/RegionInfoImpl.h index 74591ee25ae5..8b04393dd294 100644 --- a/llvm/include/llvm/Analysis/RegionInfoImpl.h +++ b/llvm/include/llvm/Analysis/RegionInfoImpl.h @@ -254,7 +254,9 @@ void RegionBase::verifyBBInRegion(BlockT *BB) const { if (entry != BB) { for (BlockT *Pred : make_range(InvBlockTraits::child_begin(BB), InvBlockTraits::child_end(BB))) { - if (!contains(Pred)) + // Allow predecessors that are unreachable, as these are ignored during + // region analysis. + if (!contains(Pred) && DT->isReachableFromEntry(Pred)) report_fatal_error("Broken region found: edges entering the region must " "go to the entry node!"); } diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h index 8cab01c2f11d..ba94852d0dbf 100644 --- a/llvm/include/llvm/Analysis/ValueTracking.h +++ b/llvm/include/llvm/Analysis/ValueTracking.h @@ -124,10 +124,6 @@ bool isKnownToBeAPowerOfTwo(const Value *V, const DataLayout &DL, const DominatorTree *DT = nullptr, bool UseInstrInfo = true); -/// Return true if the given instruction is only used in zero comparison -bool isOnlyUsedInZeroComparison(const Instruction *CxtI); - -/// Return true if the given instruction is only used in zero equality comparison bool isOnlyUsedInZeroEqualityComparison(const Instruction *CxtI); /// Return true if the given value is known to be non-zero when defined. 
For diff --git a/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h b/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h index ab1219328a5d..b77bcdb89024 100644 --- a/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h +++ b/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h @@ -579,7 +579,7 @@ void CodeGenPassBuilder::addISelPasses(AddIRPass &addPass) const { if (TM.useEmulatedTLS()) addPass(LowerEmuTLSPass()); - addPass(PreISelIntrinsicLoweringPass()); + addPass(PreISelIntrinsicLoweringPass(TM)); derived().addIRPasses(addPass); derived().addCodeGenPrepare(addPass); diff --git a/llvm/include/llvm/CodeGen/LowLevelType.h b/llvm/include/llvm/CodeGen/LowLevelType.h index 2924f475ac85..a1e778cd718f 100644 --- a/llvm/include/llvm/CodeGen/LowLevelType.h +++ b/llvm/include/llvm/CodeGen/LowLevelType.h @@ -238,10 +238,9 @@ class LLT { return getFieldValue(VectorSizeFieldInfo); else return getFieldValue(PointerVectorSizeFieldInfo); - } else if (IsPointer) - return getFieldValue(PointerSizeFieldInfo); - else - llvm_unreachable("unexpected LLT"); + } + assert(IsPointer && "unexpected LLT"); + return getFieldValue(PointerSizeFieldInfo); } constexpr unsigned getAddressSpace() const { diff --git a/llvm/include/llvm/CodeGen/PreISelIntrinsicLowering.h b/llvm/include/llvm/CodeGen/PreISelIntrinsicLowering.h index 73d7d779e55b..aa6a0e6935b3 100644 --- a/llvm/include/llvm/CodeGen/PreISelIntrinsicLowering.h +++ b/llvm/include/llvm/CodeGen/PreISelIntrinsicLowering.h @@ -18,9 +18,13 @@ namespace llvm { class Module; +class TargetMachine; struct PreISelIntrinsicLoweringPass : PassInfoMixin { + const TargetMachine &TM; + + PreISelIntrinsicLoweringPass(const TargetMachine &TM) : TM(TM) {} PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); }; diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index 817d32ea0ef6..93dfcfc39924 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -1044,16 +1044,6 @@ class TargetInstrInfo : public MCInstrInfo { return isCopyInstrImpl(MI); } - bool isFullCopyInstr(const MachineInstr &MI) const { - auto DestSrc = isCopyInstr(MI); - if (!DestSrc) - return false; - - const MachineOperand *DestRegOp = DestSrc->Destination; - const MachineOperand *SrcRegOp = DestSrc->Source; - return !DestRegOp->getSubReg() && !SrcRegOp->getSubReg(); - } - /// If the specific machine instruction is an instruction that adds an /// immediate value and a physical register, and stores the result in /// the given physical register \c Reg, return a pair of the source @@ -1968,13 +1958,6 @@ class TargetInstrInfo : public MCInstrInfo { return false; } - /// Allows targets to use appropriate copy instruction while spilitting live - /// range of a register in register allocation. - virtual unsigned getLiveRangeSplitOpcode(Register Reg, - const MachineFunction &MF) const { - return TargetOpcode::COPY; - } - /// During PHI eleimination lets target to make necessary checks and /// insert the copy to the PHI destination register in a target specific /// manner. 
diff --git a/llvm/include/llvm/Object/Wasm.h b/llvm/include/llvm/Object/Wasm.h index 8dd8918ddf21..dfab4c68d18f 100644 --- a/llvm/include/llvm/Object/Wasm.h +++ b/llvm/include/llvm/Object/Wasm.h @@ -104,12 +104,14 @@ class WasmSymbol { struct WasmSection { WasmSection() = default; - uint32_t Type = 0; // Section type (See below) - uint32_t Offset = 0; // Offset with in the file + uint32_t Type = 0; + uint32_t Offset = 0; // Offset within the file StringRef Name; // Section name (User-defined sections only) uint32_t Comdat = UINT32_MAX; // From the "comdat info" section - ArrayRef Content; // Section content - std::vector Relocations; // Relocations for this section + ArrayRef Content; + std::vector Relocations; + // Length of the LEB encoding of the section header's size field + std::optional HeaderSecSizeEncodingLen; }; struct WasmSegment { diff --git a/llvm/include/llvm/ObjectYAML/WasmYAML.h b/llvm/include/llvm/ObjectYAML/WasmYAML.h index 0f6c4f06665f..94ecc2fcfdb5 100644 --- a/llvm/include/llvm/ObjectYAML/WasmYAML.h +++ b/llvm/include/llvm/ObjectYAML/WasmYAML.h @@ -189,6 +189,7 @@ struct Section { SectionType Type; std::vector Relocations; + std::optional HeaderSecSizeEncodingLen; }; struct CustomSection : Section { diff --git a/llvm/include/llvm/Option/ArgList.h b/llvm/include/llvm/Option/ArgList.h index 310c8900af9e..c57bd2350af1 100644 --- a/llvm/include/llvm/Option/ArgList.h +++ b/llvm/include/llvm/Option/ArgList.h @@ -299,6 +299,7 @@ class ArgList { /// \p Default if neither option is given. If both the option and its /// negation are present, the last one wins. bool hasFlag(OptSpecifier Pos, OptSpecifier Neg, bool Default) const; + bool hasFlagNoClaim(OptSpecifier Pos, OptSpecifier Neg, bool Default) const; /// hasFlag - Given an option \p Pos, an alias \p PosAlias and its negative /// form \p Neg, return true if the option or its alias is present, false if diff --git a/llvm/include/llvm/Support/type_traits.h b/llvm/include/llvm/Support/type_traits.h index 86f07c19477d..3fd158def34d 100644 --- a/llvm/include/llvm/Support/type_traits.h +++ b/llvm/include/llvm/Support/type_traits.h @@ -69,21 +69,6 @@ struct const_pointer_or_const_ref>> { }; namespace detail { -/// Internal utility to detect trivial copy construction. -template union copy_construction_triviality_helper { - T t; - copy_construction_triviality_helper() = default; - copy_construction_triviality_helper(const copy_construction_triviality_helper&) = default; - ~copy_construction_triviality_helper() = default; -}; -/// Internal utility to detect trivial move construction. -template union move_construction_triviality_helper { - T t; - move_construction_triviality_helper() = default; - move_construction_triviality_helper(move_construction_triviality_helper&&) = default; - ~move_construction_triviality_helper() = default; -}; - template union trivial_helper { T t; @@ -91,29 +76,6 @@ union trivial_helper { } // end namespace detail -/// An implementation of `std::is_trivially_copy_constructible` since we have -/// users with STLs that don't yet include it. -template -struct is_trivially_copy_constructible - : std::is_copy_constructible< - ::llvm::detail::copy_construction_triviality_helper> {}; -template -struct is_trivially_copy_constructible : std::true_type {}; -template -struct is_trivially_copy_constructible : std::false_type {}; - -/// An implementation of `std::is_trivially_move_constructible` since we have -/// users with STLs that don't yet include it. 
-template -struct is_trivially_move_constructible - : std::is_move_constructible< - ::llvm::detail::move_construction_triviality_helper> {}; -template -struct is_trivially_move_constructible : std::true_type {}; -template -struct is_trivially_move_constructible : std::true_type {}; - - template struct is_copy_assignable { template diff --git a/llvm/include/llvm/TargetParser/LoongArchTargetParser.h b/llvm/include/llvm/TargetParser/LoongArchTargetParser.h index 82ab064211d7..028844187584 100644 --- a/llvm/include/llvm/TargetParser/LoongArchTargetParser.h +++ b/llvm/include/llvm/TargetParser/LoongArchTargetParser.h @@ -66,13 +66,9 @@ struct ArchInfo { bool isValidArchName(StringRef Arch); bool getArchFeatures(StringRef Arch, std::vector &Features); -bool isValidTuneCPUName(StringRef TuneCPU); -void fillValidTuneCPUList(SmallVectorImpl &Values); +bool isValidCPUName(StringRef TuneCPU); +void fillValidCPUList(SmallVectorImpl &Values); StringRef getDefaultArch(bool Is64Bit); -void setArch(StringRef Arch); -StringRef getArch(); -void setTuneCPU(StringRef TuneCPU); -StringRef getTuneCPU(); } // namespace LoongArch diff --git a/llvm/include/llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h b/llvm/include/llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h index 3568417510f1..2d76546316fa 100644 --- a/llvm/include/llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h +++ b/llvm/include/llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h @@ -8,7 +8,7 @@ /// \file /// /// AggressiveInstCombiner - Combine expression patterns to form expressions -/// with fewer, simple instructions. +/// with fewer, simple instructions. This pass does not modify the CFG. /// //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h index d3e5e2591eea..9ce64623e25b 100644 --- a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h +++ b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h @@ -20,7 +20,6 @@ namespace llvm { class AAResults; -class AllocaInst; class BatchAAResults; class AssumptionCache; class CallBase; @@ -78,9 +77,6 @@ class MemCpyOptPass : public PassInfoMixin { Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr, Value *ByteVal); bool moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI); - bool performStackMoveOptzn(Instruction *Load, Instruction *Store, - AllocaInst *DestAlloca, AllocaInst *SrcAlloca, - uint64_t Size, BatchAAResults &BAA); void eraseInstruction(Instruction *I); bool iterateOnFunction(Function &F); diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 5d526858e00e..410f93b1c215 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -261,13 +261,6 @@ bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS, return KnownBits::haveNoCommonBitsSet(LHSKnown, RHSKnown); } -bool llvm::isOnlyUsedInZeroComparison(const Instruction *I) { - return !I->user_empty() && all_of(I->users(), [](const User *U) { - ICmpInst::Predicate P; - return match(U, m_ICmp(P, m_Value(), m_Zero())); - }); -} - bool llvm::isOnlyUsedInZeroEqualityComparison(const Instruction *I) { return !I->user_empty() && all_of(I->users(), [](const User *U) { ICmpInst::Predicate P; diff --git a/llvm/lib/CodeGen/CalcSpillWeights.cpp b/llvm/lib/CodeGen/CalcSpillWeights.cpp index 0377bc002067..5a005ba7b414 100644 --- 
a/llvm/lib/CodeGen/CalcSpillWeights.cpp +++ b/llvm/lib/CodeGen/CalcSpillWeights.cpp @@ -97,7 +97,7 @@ bool VirtRegAuxInfo::isRematerializable(const LiveInterval &LI, // Trace copies introduced by live range splitting. The inline // spiller can rematerialize through these copies, so the spill // weight must reflect this. - while (TII.isFullCopyInstr(*MI)) { + while (MI->isFullCopy()) { // The copy destination must match the interval register. if (MI->getOperand(0).getReg() != Reg) return false; @@ -224,16 +224,7 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start, continue; NumInstr++; - bool identityCopy = false; - auto DestSrc = TII.isCopyInstr(*MI); - if (DestSrc) { - const MachineOperand *DestRegOp = DestSrc->Destination; - const MachineOperand *SrcRegOp = DestSrc->Source; - identityCopy = DestRegOp->getReg() == SrcRegOp->getReg() && - DestRegOp->getSubReg() == SrcRegOp->getSubReg(); - } - - if (identityCopy || MI->isImplicitDef()) + if (MI->isIdentityCopy() || MI->isImplicitDef()) continue; if (!Visited.insert(MI).second) continue; @@ -267,7 +258,7 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start, } // Get allocation hints from copies. - if (!TII.isCopyInstr(*MI)) + if (!MI->isCopy()) continue; Register HintReg = copyHint(MI, LI.reg(), TRI, MRI); if (!HintReg) diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp index 02c67e500bdc..952f454f8f6a 100644 --- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp +++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp @@ -226,6 +226,7 @@ class ComplexDeinterleavingGraph { const TargetLowering *TL = nullptr; const TargetLibraryInfo *TLI = nullptr; SmallVector CompositeNodes; + DenseMap, NodePtr> CachedResult; SmallPtrSet FinalInstructions; @@ -292,17 +293,11 @@ class ComplexDeinterleavingGraph { NodePtr submitCompositeNode(NodePtr Node) { CompositeNodes.push_back(Node); + if (Node->Real && Node->Imag) + CachedResult[{Node->Real, Node->Imag}] = Node; return Node; } - NodePtr getContainingComposite(Value *R, Value *I) { - for (const auto &CN : CompositeNodes) { - if (CN->Real == R && CN->Imag == I) - return CN; - } - return nullptr; - } - /// Identifies a complex partial multiply pattern and its rotation, based on /// the following patterns /// @@ -900,9 +895,11 @@ ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I) { LLVM_DEBUG(dbgs() << "identifyNode on " << *R << " / " << *I << "\n"); assert(R->getType() == I->getType() && "Real and imaginary parts should not have different types"); - if (NodePtr CN = getContainingComposite(R, I)) { + + auto It = CachedResult.find({R, I}); + if (It != CachedResult.end()) { LLVM_DEBUG(dbgs() << " - Folding to existing node\n"); - return CN; + return It->second; } if (NodePtr CN = identifySplat(R, I)) @@ -949,6 +946,7 @@ ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I) { return CN; LLVM_DEBUG(dbgs() << " - Not recognised as a valid pattern.\n"); + CachedResult[{R, I}] = nullptr; return nullptr; } diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp index c62f3db9d321..277c6be418c5 100644 --- a/llvm/lib/CodeGen/InlineSpiller.cpp +++ b/llvm/lib/CodeGen/InlineSpiller.cpp @@ -256,11 +256,11 @@ Spiller *llvm::createInlineSpiller(MachineFunctionPass &Pass, // This minimizes register pressure and maximizes the store-to-load distance for // spill slots which can be important in tight loops. 
-/// isFullCopyOf - If MI is a COPY to or from Reg, return the other register, -/// otherwise return 0. -static Register isCopyOf(const MachineInstr &MI, Register Reg, - const TargetInstrInfo &TII) { - if (!TII.isCopyInstr(MI)) +/// If MI is a COPY to or from Reg, return the other register, otherwise return +/// 0. +static Register isCopyOf(const MachineInstr &MI, Register Reg) { + assert(!MI.isBundled()); + if (!MI.isCopy()) return Register(); const MachineOperand &DstOp = MI.getOperand(0); @@ -277,10 +277,9 @@ static Register isCopyOf(const MachineInstr &MI, Register Reg, } /// Check for a copy bundle as formed by SplitKit. -static Register isCopyOfBundle(const MachineInstr &FirstMI, Register Reg, - const TargetInstrInfo &TII) { +static Register isCopyOfBundle(const MachineInstr &FirstMI, Register Reg) { if (!FirstMI.isBundled()) - return isCopyOf(FirstMI, Reg, TII); + return isCopyOf(FirstMI, Reg); assert(!FirstMI.isBundledWithPred() && FirstMI.isBundledWithSucc() && "expected to see first instruction in bundle"); @@ -289,12 +288,11 @@ static Register isCopyOfBundle(const MachineInstr &FirstMI, Register Reg, MachineBasicBlock::const_instr_iterator I = FirstMI.getIterator(); while (I->isBundledWithSucc()) { const MachineInstr &MI = *I; - auto CopyInst = TII.isCopyInstr(MI); - if (!CopyInst) + if (!MI.isCopy()) return Register(); - const MachineOperand &DstOp = *CopyInst->Destination; - const MachineOperand &SrcOp = *CopyInst->Source; + const MachineOperand &DstOp = MI.getOperand(0); + const MachineOperand &SrcOp = MI.getOperand(1); if (DstOp.getReg() == Reg) { if (!SnipReg) SnipReg = SrcOp.getReg(); @@ -360,7 +358,7 @@ bool InlineSpiller::isSnippet(const LiveInterval &SnipLI) { MachineInstr &MI = *RI++; // Allow copies to/from Reg. - if (isCopyOfBundle(MI, Reg, TII)) + if (isCopyOfBundle(MI, Reg)) continue; // Allow stack slot loads. @@ -398,7 +396,7 @@ void InlineSpiller::collectRegsToSpill() { return; for (MachineInstr &MI : llvm::make_early_inc_range(MRI.reg_bundles(Reg))) { - Register SnipReg = isCopyOfBundle(MI, Reg, TII); + Register SnipReg = isCopyOfBundle(MI, Reg); if (!isSibling(SnipReg)) continue; LiveInterval &SnipLI = LIS.getInterval(SnipReg); @@ -521,14 +519,14 @@ void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) { // Find all spills and copies of VNI. for (MachineInstr &MI : llvm::make_early_inc_range(MRI.use_nodbg_bundles(Reg))) { - if (!MI.mayStore() && !TII.isCopyInstr(MI)) + if (!MI.isCopy() && !MI.mayStore()) continue; SlotIndex Idx = LIS.getInstructionIndex(MI); if (LI->getVNInfoAt(Idx) != VNI) continue; // Follow sibling copies down the dominator tree. - if (Register DstReg = isCopyOfBundle(MI, Reg, TII)) { + if (Register DstReg = isCopyOfBundle(MI, Reg)) { if (isSibling(DstReg)) { LiveInterval &DstLI = LIS.getInterval(DstReg); VNInfo *DstVNI = DstLI.getVNInfoAt(Idx.getRegSlot()); @@ -872,7 +870,7 @@ foldMemoryOperand(ArrayRef> Ops, if (Ops.back().first != MI || MI->isBundled()) return false; - bool WasCopy = TII.isCopyInstr(*MI).has_value(); + bool WasCopy = MI->isCopy(); Register ImpReg; // TII::foldMemoryOperand will do what we need here for statepoint @@ -1157,7 +1155,7 @@ void InlineSpiller::spillAroundUses(Register Reg) { Idx = VNI->def; // Check for a sibling copy. - Register SibReg = isCopyOfBundle(MI, Reg, TII); + Register SibReg = isCopyOfBundle(MI, Reg); if (SibReg && isSibling(SibReg)) { // This may actually be a copy between snippets. 
if (isRegToSpill(SibReg)) { diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp index ff49e080090c..c3477cd8ce34 100644 --- a/llvm/lib/CodeGen/LiveRangeEdit.cpp +++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp @@ -352,8 +352,7 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) { // unlikely to change anything. We typically don't want to shrink the // PIC base register that has lots of uses everywhere. // Always shrink COPY uses that probably come from live range splitting. - if ((MI->readsVirtualRegister(Reg) && - (MO.isDef() || TII.isCopyInstr(*MI))) || + if ((MI->readsVirtualRegister(Reg) && (MI->isCopy() || MO.isDef())) || (MO.readsReg() && (MRI.hasOneNonDBGUse(Reg) || useIsKill(LI, MO)))) ToShrink.insert(&LI); else if (MO.readsReg()) diff --git a/llvm/lib/CodeGen/LiveRangeShrink.cpp b/llvm/lib/CodeGen/LiveRangeShrink.cpp index af7d6c4403b8..93f5314539cd 100644 --- a/llvm/lib/CodeGen/LiveRangeShrink.cpp +++ b/llvm/lib/CodeGen/LiveRangeShrink.cpp @@ -23,7 +23,6 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" @@ -110,7 +109,6 @@ bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) { return false; MachineRegisterInfo &MRI = MF.getRegInfo(); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); LLVM_DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n'); @@ -199,7 +197,7 @@ bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) { // is because it needs more accurate model to handle register // pressure correctly. MachineInstr &DefInstr = *MRI.def_instr_begin(Reg); - if (!TII.isCopyInstr(DefInstr)) + if (!DefInstr.isCopy()) NumEligibleUse++; Insert = FindDominatedInstruction(DefInstr, Insert, IOM); } else { diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp index 4e80e9b58c06..523e077fd9a2 100644 --- a/llvm/lib/CodeGen/MachineLICM.cpp +++ b/llvm/lib/CodeGen/MachineLICM.cpp @@ -538,6 +538,10 @@ void MachineLICMBase::HoistRegionPostRA() { PhysRegDefs.set(*AI); } + // Funclet entry blocks will clobber all registers + if (const uint32_t *Mask = BB->getBeginClobberMask(TRI)) + PhysRegClobbers.setBitsNotInMask(Mask); + SpeculationState = SpeculateUnknown; for (MachineInstr &MI : *BB) ProcessMI(&MI, PhysRegDefs, PhysRegClobbers, StoredFIs, Candidates); diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp index 3448c56e4994..5b822b5d7b95 100644 --- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp +++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp @@ -14,9 +14,10 @@ #include "llvm/CodeGen/PreISelIntrinsicLowering.h" #include "llvm/Analysis/ObjCARCInstKind.h" #include "llvm/Analysis/ObjCARCUtil.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" @@ -26,6 +27,7 @@ #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" +#include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Utils/LowerMemIntrinsics.h" using namespace llvm; @@ -41,19 +43,19 @@ static cl::opt MemIntrinsicExpandSizeThresholdOpt( namespace { struct 
PreISelIntrinsicLowering { + const TargetMachine &TM; const function_ref LookupTTI; - const function_ref LookupLibInfo; /// If this is true, assume it's preferably to leave memory intrinsic calls /// for replacement with a library call later. Otherwise this depends on - /// TargetLibraryInfo availability of the corresponding function. + /// TargetLoweringInfo availability of the corresponding function. const bool UseMemIntrinsicLibFunc; explicit PreISelIntrinsicLowering( + const TargetMachine &TM_, function_ref LookupTTI_, - function_ref LookupLibInfo_, bool UseMemIntrinsicLibFunc_ = true) - : LookupTTI(LookupTTI_), LookupLibInfo(LookupLibInfo_), + : TM(TM_), LookupTTI(LookupTTI_), UseMemIntrinsicLibFunc(UseMemIntrinsicLibFunc_) {} static bool shouldExpandMemIntrinsicWithSize(Value *Size, @@ -187,6 +189,13 @@ bool PreISelIntrinsicLowering::shouldExpandMemIntrinsicWithSize( return SizeVal > Threshold || Threshold == 0; } +static bool canEmitLibcall(const TargetMachine &TM, Function *F, + RTLIB::Libcall LC) { + // TODO: Should this consider the address space of the memcpy? + const TargetLowering *TLI = TM.getSubtargetImpl(*F)->getTargetLowering(); + return TLI->getLibcallName(LC) != nullptr; +} + // TODO: Handle atomic memcpy and memcpy.inline // TODO: Pass ScalarEvolution bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const { @@ -203,9 +212,10 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const { const TargetTransformInfo &TTI = LookupTTI(*ParentFunc); if (shouldExpandMemIntrinsicWithSize(Memcpy->getLength(), TTI)) { if (UseMemIntrinsicLibFunc && - LookupLibInfo(*ParentFunc).has(LibFunc_memcpy)) + canEmitLibcall(TM, ParentFunc, RTLIB::MEMCPY)) break; + // TODO: For optsize, emit the loop into a separate function expandMemCpyAsLoop(Memcpy, TTI); Changed = true; Memcpy->eraseFromParent(); @@ -219,7 +229,7 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const { const TargetTransformInfo &TTI = LookupTTI(*ParentFunc); if (shouldExpandMemIntrinsicWithSize(Memmove->getLength(), TTI)) { if (UseMemIntrinsicLibFunc && - LookupLibInfo(*ParentFunc).has(LibFunc_memmove)) + canEmitLibcall(TM, ParentFunc, RTLIB::MEMMOVE)) break; if (expandMemMoveAsLoop(Memmove, TTI)) { @@ -236,7 +246,7 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const { const TargetTransformInfo &TTI = LookupTTI(*ParentFunc); if (shouldExpandMemIntrinsicWithSize(Memset->getLength(), TTI)) { if (UseMemIntrinsicLibFunc && - LookupLibInfo(*Memset->getFunction()).has(LibFunc_memset)) + canEmitLibcall(TM, ParentFunc, RTLIB::MEMSET)) break; expandMemSetAsLoop(Memset); @@ -357,8 +367,8 @@ class PreISelIntrinsicLoweringLegacyPass : public ModulePass { PreISelIntrinsicLoweringLegacyPass() : ModulePass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); AU.addRequired(); + AU.addRequired(); } bool runOnModule(Module &M) override { @@ -366,11 +376,8 @@ class PreISelIntrinsicLoweringLegacyPass : public ModulePass { return this->getAnalysis().getTTI(F); }; - auto LookupTLI = [this](Function &F) -> TargetLibraryInfo & { - return this->getAnalysis().getTLI(F); - }; - - PreISelIntrinsicLowering Lowering(LookupTTI, LookupTLI); + const auto &TM = getAnalysis().getTM(); + PreISelIntrinsicLowering Lowering(TM, LookupTTI); return Lowering.lowerIntrinsics(M); } }; @@ -379,27 +386,28 @@ class PreISelIntrinsicLoweringLegacyPass : public ModulePass { char PreISelIntrinsicLoweringLegacyPass::ID; 
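Since the pass now carries a TargetMachine (so libcall availability is answered by TargetLowering rather than TargetLibraryInfo), constructing it outside CodeGenPassBuilder changes accordingly. A minimal sketch under the new pass manager, assuming a TargetMachine reference is already in scope; the wrapper function name is hypothetical:

    #include "llvm/CodeGen/PreISelIntrinsicLowering.h"
    #include "llvm/IR/PassManager.h"

    // Mirrors the CodeGenPassBuilder change above: the pass is handed the
    // TargetMachine at construction time.
    void addPreISelLowering(llvm::ModulePassManager &MPM,
                            const llvm::TargetMachine &TM) {
      MPM.addPass(llvm::PreISelIntrinsicLoweringPass(TM));
    }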
-INITIALIZE_PASS(PreISelIntrinsicLoweringLegacyPass, - "pre-isel-intrinsic-lowering", "Pre-ISel Intrinsic Lowering", - false, false) +INITIALIZE_PASS_BEGIN(PreISelIntrinsicLoweringLegacyPass, + "pre-isel-intrinsic-lowering", + "Pre-ISel Intrinsic Lowering", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END(PreISelIntrinsicLoweringLegacyPass, + "pre-isel-intrinsic-lowering", + "Pre-ISel Intrinsic Lowering", false, false) ModulePass *llvm::createPreISelIntrinsicLoweringPass() { - return new PreISelIntrinsicLoweringLegacyPass; + return new PreISelIntrinsicLoweringLegacyPass(); } PreservedAnalyses PreISelIntrinsicLoweringPass::run(Module &M, ModuleAnalysisManager &AM) { auto &FAM = AM.getResult(M).getManager(); - auto LookupTLI = [&FAM](Function &F) -> TargetLibraryInfo & { - return FAM.getResult(F); - }; - auto LookupTTI = [&FAM](Function &F) -> TargetTransformInfo & { return FAM.getResult(F); }; - PreISelIntrinsicLowering Lowering(LookupTTI, LookupTLI); + PreISelIntrinsicLowering Lowering(TM, LookupTTI); if (!Lowering.lowerIntrinsics(M)) return PreservedAnalyses::all(); else diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index 68f6ea3268a9..48187e575494 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -1282,12 +1282,10 @@ static LaneBitmask getInstReadLaneMask(const MachineRegisterInfo &MRI, /// VirtReg. static bool readsLaneSubset(const MachineRegisterInfo &MRI, const MachineInstr *MI, const LiveInterval &VirtReg, - const TargetRegisterInfo *TRI, SlotIndex Use, - const TargetInstrInfo *TII) { + const TargetRegisterInfo *TRI, SlotIndex Use) { // Early check the common case. - auto DestSrc = TII->isCopyInstr(*MI); - if (DestSrc && - DestSrc->Destination->getSubReg() == DestSrc->Source->getSubReg()) + if (MI->isCopy() && + MI->getOperand(0).getSubReg() == MI->getOperand(1).getSubReg()) return false; // FIXME: We're only considering uses, but should be consider defs too? @@ -1346,14 +1344,14 @@ unsigned RAGreedy::tryInstructionSplit(const LiveInterval &VirtReg, // the allocation. for (const SlotIndex Use : Uses) { if (const MachineInstr *MI = Indexes->getInstructionFromIndex(Use)) { - if (TII->isFullCopyInstr(*MI) || + if (MI->isFullCopy() || (SplitSubClass && SuperRCNumAllocatableRegs == getNumAllocatableRegsForConstraints(MI, VirtReg.reg(), SuperRC, TII, TRI, RegClassInfo)) || // TODO: Handle split for subranges with subclass constraints? (!SplitSubClass && VirtReg.hasSubRanges() && - !readsLaneSubset(*MRI, MI, VirtReg, TRI, Use, TII))) { + !readsLaneSubset(*MRI, MI, VirtReg, TRI, Use))) { LLVM_DEBUG(dbgs() << " skip:\t" << Use << '\t' << *MI); continue; } @@ -2140,7 +2138,7 @@ void RAGreedy::initializeCSRCost() { /// \p Out is not cleared before being populated. void RAGreedy::collectHintInfo(Register Reg, HintsInfo &Out) { for (const MachineInstr &Instr : MRI->reg_nodbg_instructions(Reg)) { - if (!TII->isFullCopyInstr(Instr)) + if (!Instr.isFullCopy()) continue; // Look for the other end of the copy. 
Register OtherReg = Instr.getOperand(0).getReg(); @@ -2455,10 +2453,9 @@ RAGreedy::RAGreedyStats RAGreedy::computeStats(MachineBasicBlock &MBB) { MI.getOpcode() == TargetOpcode::STATEPOINT; }; for (MachineInstr &MI : MBB) { - auto DestSrc = TII->isCopyInstr(MI); - if (DestSrc) { - const MachineOperand &Dest = *DestSrc->Destination; - const MachineOperand &Src = *DestSrc->Source; + if (MI.isCopy()) { + const MachineOperand &Dest = MI.getOperand(0); + const MachineOperand &Src = MI.getOperand(1); Register SrcReg = Src.getReg(); Register DestReg = Dest.getReg(); // Only count `COPY`s with a virtual register as source or destination. diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index de909cc10795..235f0da86b90 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -20492,9 +20492,11 @@ SDValue DAGCombiner::replaceStoreOfInsertLoad(StoreSDNode *ST) { SDValue Elt = Value.getOperand(1); SDValue Idx = Value.getOperand(2); - // If the element isn't byte sized then we can't compute an offset + // If the element isn't byte sized or is implicitly truncated then we can't + // compute an offset. EVT EltVT = Elt.getValueType(); - if (!EltVT.isByteSized()) + if (!EltVT.isByteSized() || + EltVT != Value.getOperand(0).getValueType().getVectorElementType()) return SDValue(); auto *Ld = dyn_cast(Value.getOperand(0)); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 5c1b19eba1c1..30d202494320 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -1945,6 +1945,9 @@ SDValue SelectionDAG::getVScale(const SDLoc &DL, EVT VT, APInt MulImm, assert(MulImm.getBitWidth() == VT.getSizeInBits() && "APInt size does not match type size!"); + if (MulImm == 0) + return getConstant(0, DL, VT); + if (ConstantFold) { const MachineFunction &MF = getMachineFunction(); auto Attr = MF.getFunction().getFnAttribute(Attribute::VScaleRange); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 9595da9d0d8a..20c37eb4cb11 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -4156,6 +4156,18 @@ void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) { assert(FuncInfo.MF->getFrameInfo().hasVarSizedObjects()); } +static const MDNode *getRangeMetadata(const Instruction &I) { + // If !noundef is not present, then !range violation results in a poison + // value rather than immediate undefined behavior. In theory, transferring + // these annotations to SDAG is fine, but in practice there are key SDAG + // transforms that are known not to be poison-safe, such as folding logical + // and/or to bitwise and/or. For now, only transfer !range if !noundef is + // also present. 
+ if (!I.hasMetadata(LLVMContext::MD_noundef)) + return nullptr; + return I.getMetadata(LLVMContext::MD_range); +} + void SelectionDAGBuilder::visitLoad(const LoadInst &I) { if (I.isAtomic()) return visitAtomicLoad(I); @@ -4180,7 +4192,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { Type *Ty = I.getType(); SmallVector ValueVTs, MemVTs; - SmallVector Offsets; + SmallVector Offsets; ComputeValueVTs(TLI, DAG.getDataLayout(), Ty, ValueVTs, &MemVTs, &Offsets, 0); unsigned NumValues = ValueVTs.size(); if (NumValues == 0) @@ -4188,7 +4200,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { Align Alignment = I.getAlign(); AAMDNodes AAInfo = I.getAAMetadata(); - const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range); + const MDNode *Ranges = getRangeMetadata(I); bool isVolatile = I.isVolatile(); MachineMemOperand::Flags MMOFlags = TLI.getLoadMemOperandFlags(I, DAG.getDataLayout(), AC, LibInfo); @@ -4219,14 +4231,8 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { if (isVolatile) Root = TLI.prepareVolatileOrAtomicLoad(Root, dl, DAG); - // An aggregate load cannot wrap around the address space, so offsets to its - // parts don't wrap either. - SDNodeFlags Flags; - Flags.setNoUnsignedWrap(true); - SmallVector Values(NumValues); SmallVector Chains(std::min(MaxParallelChains, NumValues)); - EVT PtrVT = Ptr.getValueType(); unsigned ChainI = 0; for (unsigned i = 0; i != NumValues; ++i, ++ChainI) { @@ -4243,13 +4249,15 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { Root = Chain; ChainI = 0; } - SDValue A = DAG.getNode(ISD::ADD, dl, - PtrVT, Ptr, - DAG.getConstant(Offsets[i], dl, PtrVT), - Flags); - SDValue L = DAG.getLoad(MemVTs[i], dl, Root, A, - MachinePointerInfo(SV, Offsets[i]), Alignment, + // TODO: MachinePointerInfo only supports a fixed length offset. + MachinePointerInfo PtrInfo = + !Offsets[i].isScalable() || Offsets[i].isZero() + ? MachinePointerInfo(SV, Offsets[i].getKnownMinValue()) + : MachinePointerInfo(); + + SDValue A = DAG.getObjectPtrOffset(dl, Ptr, Offsets[i]); + SDValue L = DAG.getLoad(MemVTs[i], dl, Root, A, PtrInfo, Alignment, MMOFlags, AAInfo, Ranges); Chains[ChainI] = L.getValue(1); @@ -4351,7 +4359,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) { } SmallVector ValueVTs, MemVTs; - SmallVector Offsets; + SmallVector Offsets; ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), SrcV->getType(), ValueVTs, &MemVTs, &Offsets, 0); unsigned NumValues = ValueVTs.size(); @@ -4372,11 +4380,6 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) { auto MMOFlags = TLI.getStoreMemOperandFlags(I, DAG.getDataLayout()); - // An aggregate load cannot wrap around the address space, so offsets to its - // parts don't wrap either. - SDNodeFlags Flags; - Flags.setNoUnsignedWrap(true); - unsigned ChainI = 0; for (unsigned i = 0; i != NumValues; ++i, ++ChainI) { // See visitLoad comments. @@ -4386,14 +4389,19 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) { Root = Chain; ChainI = 0; } - SDValue Add = - DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(Offsets[i]), dl, Flags); + + // TODO: MachinePointerInfo only supports a fixed length offset. + MachinePointerInfo PtrInfo = + !Offsets[i].isScalable() || Offsets[i].isZero() + ? 
MachinePointerInfo(PtrV, Offsets[i].getKnownMinValue()) + : MachinePointerInfo(); + + SDValue Add = DAG.getObjectPtrOffset(dl, Ptr, Offsets[i]); SDValue Val = SDValue(Src.getNode(), Src.getResNo() + i); if (MemVTs[i] != ValueVTs[i]) Val = DAG.getPtrExtOrTrunc(Val, dl, MemVTs[i]); SDValue St = - DAG.getStore(Root, dl, Val, Add, MachinePointerInfo(PtrV, Offsets[i]), - Alignment, MMOFlags, AAInfo); + DAG.getStore(Root, dl, Val, Add, PtrInfo, Alignment, MMOFlags, AAInfo); Chains[ChainI] = St; } @@ -4607,7 +4615,7 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) { Alignment = DAG.getEVTAlign(VT); AAMDNodes AAInfo = I.getAAMetadata(); - const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range); + const MDNode *Ranges = getRangeMetadata(I); // Do not serialize masked loads of constant memory with anything. MemoryLocation ML = MemoryLocation::getAfter(PtrOperand, AAInfo); @@ -4641,7 +4649,7 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) { ->getMaybeAlignValue() .value_or(DAG.getEVTAlign(VT.getScalarType())); - const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range); + const MDNode *Ranges = getRangeMetadata(I); SDValue Root = DAG.getRoot(); SDValue Base; @@ -7396,7 +7404,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, } SDValue TripCount = getValue(I.getOperand(1)); - auto VecTy = CCVT.changeVectorElementType(ElementVT); + EVT VecTy = EVT::getVectorVT(*DAG.getContext(), ElementVT, + CCVT.getVectorElementCount()); SDValue VectorIndex = DAG.getSplat(VecTy, sdl, Index); SDValue VectorTripCount = DAG.getSplat(VecTy, sdl, TripCount); @@ -7645,7 +7654,7 @@ void SelectionDAGBuilder::visitVPLoad( Value *PtrOperand = VPIntrin.getArgOperand(0); MaybeAlign Alignment = VPIntrin.getPointerAlignment(); AAMDNodes AAInfo = VPIntrin.getAAMetadata(); - const MDNode *Ranges = VPIntrin.getMetadata(LLVMContext::MD_range); + const MDNode *Ranges = getRangeMetadata(VPIntrin); SDValue LD; // Do not serialize variable-length loads of constant memory with // anything. @@ -7672,7 +7681,7 @@ void SelectionDAGBuilder::visitVPGather( Value *PtrOperand = VPIntrin.getArgOperand(0); MaybeAlign Alignment = VPIntrin.getPointerAlignment(); AAMDNodes AAInfo = VPIntrin.getAAMetadata(); - const MDNode *Ranges = VPIntrin.getMetadata(LLVMContext::MD_range); + const MDNode *Ranges = getRangeMetadata(VPIntrin); SDValue LD; if (!Alignment) Alignment = DAG.getEVTAlign(VT.getScalarType()); @@ -7779,7 +7788,7 @@ void SelectionDAGBuilder::visitVPStridedLoad( if (!Alignment) Alignment = DAG.getEVTAlign(VT.getScalarType()); AAMDNodes AAInfo = VPIntrin.getAAMetadata(); - const MDNode *Ranges = VPIntrin.getMetadata(LLVMContext::MD_range); + const MDNode *Ranges = getRangeMetadata(VPIntrin); MemoryLocation ML = MemoryLocation::getAfter(PtrOperand, AAInfo); bool AddToChain = !AA || !AA->pointsToConstantMemory(ML); SDValue InChain = AddToChain ? 
DAG.getRoot() : DAG.getEntryNode(); @@ -9626,7 +9635,7 @@ void SelectionDAGBuilder::visitVACopy(const CallInst &I) { SDValue SelectionDAGBuilder::lowerRangeToAssertZExt(SelectionDAG &DAG, const Instruction &I, SDValue Op) { - const MDNode *Range = I.getMetadata(LLVMContext::MD_range); + const MDNode *Range = getRangeMetadata(I); if (!Range) return Op; @@ -11310,8 +11319,32 @@ void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond, } } - if (FallthroughUnreachable) - JTH->FallthroughUnreachable = true; + // If the default clause is unreachable, propagate that knowledge into + // JTH->FallthroughUnreachable which will use it to suppress the range + // check. + // + // However, don't do this if we're doing branch target enforcement, + // because a table branch _without_ a range check can be a tempting JOP + // gadget - out-of-bounds inputs that are impossible in correct + // execution become possible again if an attacker can influence the + // control flow. So if an attacker doesn't already have a BTI bypass + // available, we don't want them to be able to get one out of this + // table branch. + if (FallthroughUnreachable) { + Function &CurFunc = CurMF->getFunction(); + bool HasBranchTargetEnforcement = false; + if (CurFunc.hasFnAttribute("branch-target-enforcement")) { + HasBranchTargetEnforcement = + CurFunc.getFnAttribute("branch-target-enforcement") + .getValueAsBool(); + } else { + HasBranchTargetEnforcement = + CurMF->getMMI().getModule()->getModuleFlag( + "branch-target-enforcement"); + } + if (!HasBranchTargetEnforcement) + JTH->FallthroughUnreachable = true; + } if (!JTH->FallthroughUnreachable) addSuccessorWithProb(CurMBB, Fallthrough, FallthroughProb); diff --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp index 83964eced597..eee54f09fbad 100644 --- a/llvm/lib/CodeGen/SplitKit.cpp +++ b/llvm/lib/CodeGen/SplitKit.cpp @@ -514,10 +514,10 @@ void SplitEditor::forceRecompute(unsigned RegIdx, const VNInfo &ParentVNI) { VFP = ValueForcePair(nullptr, true); } -SlotIndex SplitEditor::buildSingleSubRegCopy( - Register FromReg, Register ToReg, MachineBasicBlock &MBB, - MachineBasicBlock::iterator InsertBefore, unsigned SubIdx, - LiveInterval &DestLI, bool Late, SlotIndex Def, const MCInstrDesc &Desc) { +SlotIndex SplitEditor::buildSingleSubRegCopy(Register FromReg, Register ToReg, + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, + unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex Def) { + const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY); bool FirstCopy = !Def.isValid(); MachineInstr *CopyMI = BuildMI(MBB, InsertBefore, DebugLoc(), Desc) .addReg(ToReg, RegState::Define | getUndefRegState(FirstCopy) @@ -536,8 +536,7 @@ SlotIndex SplitEditor::buildSingleSubRegCopy( SlotIndex SplitEditor::buildCopy(Register FromReg, Register ToReg, LaneBitmask LaneMask, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, bool Late, unsigned RegIdx) { - const MCInstrDesc &Desc = - TII.get(TII.getLiveRangeSplitOpcode(FromReg, *MBB.getParent())); + const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY); SlotIndexes &Indexes = *LIS.getSlotIndexes(); if (LaneMask.all() || LaneMask == MRI.getMaxLaneMaskForVReg(FromReg)) { // The full vreg is copied. 
@@ -565,7 +564,7 @@ SlotIndex SplitEditor::buildCopy(Register FromReg, Register ToReg, SlotIndex Def; for (unsigned BestIdx : SubIndexes) { Def = buildSingleSubRegCopy(FromReg, ToReg, MBB, InsertBefore, BestIdx, - DestLI, Late, Def, Desc); + DestLI, Late, Def); } BumpPtrAllocator &Allocator = LIS.getVNInfoAllocator(); @@ -1585,9 +1584,7 @@ bool SplitAnalysis::shouldSplitSingleBlock(const BlockInfo &BI, if (BI.LiveIn && BI.LiveOut) return true; // No point in isolating a copy. It has no register class constraints. - MachineInstr *MI = LIS.getInstructionFromIndex(BI.FirstInstr); - bool copyLike = TII.isCopyInstr(*MI) || MI->isSubregToReg(); - if (copyLike) + if (LIS.getInstructionFromIndex(BI.FirstInstr)->isCopyLike()) return false; // Finally, don't isolate an end point that was created by earlier splits. return isOriginalEndpoint(BI.FirstInstr); diff --git a/llvm/lib/CodeGen/SplitKit.h b/llvm/lib/CodeGen/SplitKit.h index 1174e392e4e4..f764ffd4750c 100644 --- a/llvm/lib/CodeGen/SplitKit.h +++ b/llvm/lib/CodeGen/SplitKit.h @@ -428,11 +428,8 @@ class LLVM_LIBRARY_VISIBILITY SplitEditor { bool Late, unsigned RegIdx); SlotIndex buildSingleSubRegCopy(Register FromReg, Register ToReg, - MachineBasicBlock &MB, - MachineBasicBlock::iterator InsertBefore, - unsigned SubIdx, LiveInterval &DestLI, - bool Late, SlotIndex Def, - const MCInstrDesc &Desc); + MachineBasicBlock &MB, MachineBasicBlock::iterator InsertBefore, + unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex Def); public: /// Create a new SplitEditor for editing the LiveInterval analyzed by SA. diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp index 09dcddc17b06..b29404b42519 100644 --- a/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -440,9 +440,8 @@ MachineInstr &TargetInstrInfo::duplicate(MachineBasicBlock &MBB, // If the COPY instruction in MI can be folded to a stack operation, return // the register class to use. static const TargetRegisterClass *canFoldCopy(const MachineInstr &MI, - const TargetInstrInfo &TII, unsigned FoldIdx) { - assert(TII.isCopyInstr(MI) && "MI must be a COPY instruction"); + assert(MI.isCopy() && "MI must be a COPY instruction"); if (MI.getNumOperands() != 2) return nullptr; assert(FoldIdx<2 && "FoldIdx refers no nonexistent operand"); @@ -631,10 +630,10 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI, } // Straight COPY may fold as load/store. - if (!isCopyInstr(MI) || Ops.size() != 1) + if (!MI.isCopy() || Ops.size() != 1) return nullptr; - const TargetRegisterClass *RC = canFoldCopy(MI, *this, Ops[0]); + const TargetRegisterClass *RC = canFoldCopy(MI, Ops[0]); if (!RC) return nullptr; diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index badb7fe53333..68a4616fe4b8 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -921,7 +921,7 @@ void TargetLoweringBase::initActions() { // Legal, in which case all fp constants are legal, or use isFPImmLegal() // to optimize expansions for certain constants. setOperationAction(ISD::ConstantFP, - {MVT::f16, MVT::f32, MVT::f64, MVT::f80, MVT::f128}, + {MVT::bf16, MVT::f16, MVT::f32, MVT::f64, MVT::f80, MVT::f128}, Expand); // These library functions default to expand. 
diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index 647f570ab807..55fb522554fa 100644 --- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -2426,23 +2426,6 @@ MCSection *TargetLoweringObjectFileXCOFF::SelectSectionForGlobal( Name, Kind, XCOFF::CsectProperties(SMC, XCOFF::XTY_CM)); } - if (Kind.isMergeableCString()) { - Align Alignment = GO->getParent()->getDataLayout().getPreferredAlign( - cast(GO)); - - unsigned EntrySize = getEntrySizeForKind(Kind); - std::string SizeSpec = ".rodata.str" + utostr(EntrySize) + "."; - SmallString<128> Name; - Name = SizeSpec + utostr(Alignment.value()); - - if (TM.getDataSections()) - getNameWithPrefix(Name, GO, TM); - - return getContext().getXCOFFSection( - Name, Kind, XCOFF::CsectProperties(XCOFF::XMC_RO, XCOFF::XTY_SD), - /* MultiSymbolsAllowed*/ !TM.getDataSections()); - } - if (Kind.isText()) { if (TM.getFunctionSections()) { return cast(getFunctionEntryPointSymbol(GO, TM)) diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 6803d6ab1285..bc8abb751221 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -184,7 +184,6 @@ void llvm::computeLTOCacheKey( } const ModuleHash &getHash() const { return ModInfo->second.second; } - uint64_t getId() const { return ModInfo->second.first; } }; std::vector ImportModulesVector; @@ -194,11 +193,11 @@ void llvm::computeLTOCacheKey( ++It) { ImportModulesVector.push_back({It, Index.getModule(It->getKey())}); } - // Order using moduleId integer which is based on the order the module was - // added. + // Order using module hash, to be both independent of module name and + // module order. llvm::sort(ImportModulesVector, [](const ImportModule &Lhs, const ImportModule &Rhs) -> bool { - return Lhs.getId() < Rhs.getId(); + return Lhs.getHash() < Rhs.getHash(); }); for (const ImportModule &Entry : ImportModulesVector) { auto ModHash = Entry.getHash(); diff --git a/llvm/lib/ObjCopy/wasm/WasmObject.h b/llvm/lib/ObjCopy/wasm/WasmObject.h index 9bc5831926c6..f860ec697e56 100644 --- a/llvm/lib/ObjCopy/wasm/WasmObject.h +++ b/llvm/lib/ObjCopy/wasm/WasmObject.h @@ -23,6 +23,7 @@ struct Section { // For now, each section is only an opaque binary blob with no distinction // between custom and known sections. uint8_t SectionType; + std::optional HeaderSecSizeEncodingLen; StringRef Name; ArrayRef Contents; }; diff --git a/llvm/lib/ObjCopy/wasm/WasmReader.cpp b/llvm/lib/ObjCopy/wasm/WasmReader.cpp index 6e7d8b5591c9..578e78955af3 100644 --- a/llvm/lib/ObjCopy/wasm/WasmReader.cpp +++ b/llvm/lib/ObjCopy/wasm/WasmReader.cpp @@ -22,8 +22,8 @@ Expected> Reader::create() const { Obj->Sections.reserve(WasmObj.getNumSections()); for (const SectionRef &Sec : WasmObj.sections()) { const WasmSection &WS = WasmObj.getWasmSection(Sec); - Obj->Sections.push_back( - {static_cast(WS.Type), WS.Name, WS.Content}); + Obj->Sections.push_back({static_cast(WS.Type), + WS.HeaderSecSizeEncodingLen, WS.Name, WS.Content}); // Give known sections standard names to allow them to be selected. (Custom // sections already have their names filled in by the parser). 
Section &ReaderSec = Obj->Sections.back(); diff --git a/llvm/lib/ObjCopy/wasm/WasmWriter.cpp b/llvm/lib/ObjCopy/wasm/WasmWriter.cpp index fdcd441cc798..bfab25ce8097 100644 --- a/llvm/lib/ObjCopy/wasm/WasmWriter.cpp +++ b/llvm/lib/ObjCopy/wasm/WasmWriter.cpp @@ -29,16 +29,19 @@ Writer::SectionHeader Writer::createSectionHeader(const Section &S, SectionSize = S.Contents.size(); if (HasName) SectionSize += getULEB128Size(S.Name.size()) + S.Name.size(); - // Pad the LEB value out to 5 bytes to make it a predictable size, and - // match the behavior of clang. - encodeULEB128(SectionSize, OS, 5); + // If we read this section from an object file, use its original size for the + // padding of the LEB value to avoid changing the file size. Otherwise, pad + // out to 5 bytes to make it predictable, and match the behavior of clang. + unsigned HeaderSecSizeEncodingLen = + S.HeaderSecSizeEncodingLen ? *S.HeaderSecSizeEncodingLen : 5; + encodeULEB128(SectionSize, OS, HeaderSecSizeEncodingLen); if (HasName) { encodeULEB128(S.Name.size(), OS); OS << S.Name; } // Total section size is the content size plus 1 for the section type and - // 5 for the LEB-encoded size. - SectionSize = SectionSize + 1 + 5; + // the LEB-encoded size. + SectionSize = SectionSize + 1 + HeaderSecSizeEncodingLen; return Header; } diff --git a/llvm/lib/Object/SymbolSize.cpp b/llvm/lib/Object/SymbolSize.cpp index e42dbe6f47ab..eee5505b8c14 100644 --- a/llvm/lib/Object/SymbolSize.cpp +++ b/llvm/lib/Object/SymbolSize.cpp @@ -84,16 +84,21 @@ llvm::object::computeSymbolSizes(const ObjectFile &O) { array_pod_sort(Addresses.begin(), Addresses.end(), compareAddress); - // Compute the size as the gap to the next symbol - for (unsigned I = 0, N = Addresses.size() - 1; I < N; ++I) { + // Compute the size as the gap to the next symbol. If multiple symbols have + // the same address, give both the same size. Because Addresses is sorted, + // using two pointers to keep track of the current symbol vs. the next symbol + // that doesn't have the same address for size computation. + for (unsigned I = 0, NextI = 0, N = Addresses.size() - 1; I < N; ++I) { auto &P = Addresses[I]; if (P.I == O.symbol_end()) continue; - // If multiple symbol have the same address, give both the same size. - unsigned NextI = I + 1; - while (NextI < N && Addresses[NextI].Address == P.Address) - ++NextI; + // If the next pointer is behind, update it to the next symbol. + if (NextI <= I) { + NextI = I + 1; + while (NextI < N && Addresses[NextI].Address == P.Address) + ++NextI; + } uint64_t Size = Addresses[NextI].Address - P.Address; P.Address = Size; diff --git a/llvm/lib/Object/WasmObjectFile.cpp b/llvm/lib/Object/WasmObjectFile.cpp index a72242bc4ac2..11b9b579a8d7 100644 --- a/llvm/lib/Object/WasmObjectFile.cpp +++ b/llvm/lib/Object/WasmObjectFile.cpp @@ -268,7 +268,11 @@ static Error readSection(WasmSection &Section, WasmObjectFile::ReadContext &Ctx, Section.Offset = Ctx.Ptr - Ctx.Start; Section.Type = readUint8(Ctx); LLVM_DEBUG(dbgs() << "readSection type=" << Section.Type << "\n"); + // When reading the section's size, store the size of the LEB used to encode + // it. This allows objcopy/strip to reproduce the binary identically. 
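The SymbolSize.cpp hunk above sizes each symbol as the gap to the next distinct address, gives symbols that share an address the same size, and advances a second cursor only when it falls behind, so the sorted array is walked once. A standalone sketch of that two-cursor scan over a plain array; unlike the real code there is no section-end sentinel, so the final run of symbols gets size 0.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

struct SymEntry {
  uint64_t Address;
  const char *Name;
};

// Size each symbol as the gap to the next distinct address. Symbols sharing
// an address all get that same gap; NextI is only recomputed when it falls
// behind the main cursor, keeping the pass over the sorted array linear.
static std::vector<uint64_t> computeSizes(std::vector<SymEntry> Syms) {
  std::sort(Syms.begin(), Syms.end(),
            [](const SymEntry &L, const SymEntry &R) {
              return L.Address < R.Address;
            });
  std::vector<uint64_t> Sizes(Syms.size(), 0);
  for (size_t I = 0, NextI = 0, N = Syms.size(); I < N; ++I) {
    if (NextI <= I) {
      NextI = I + 1;
      while (NextI < N && Syms[NextI].Address == Syms[I].Address)
        ++NextI;
    }
    if (NextI < N)
      Sizes[I] = Syms[NextI].Address - Syms[I].Address;
  }
  return Sizes;
}

int main() {
  std::vector<uint64_t> Sizes =
      computeSizes({{0x10, "a"}, {0x10, "a.alias"}, {0x40, "b"}});
  for (uint64_t S : Sizes)
    std::printf("%llu ", (unsigned long long)S); // 48 48 0
  std::printf("\n");
  return 0;
}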
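The wasm reader and writer hunks around this point cooperate: the reader records how many bytes the original section-size ULEB128 occupied, and the objcopy writer re-encodes the size with the same padding (falling back to the 5-byte form clang emits) so the output can be byte-identical to the input; the YAML emitter hunk further down additionally rejects a stored width too small for the new size. A standalone sketch of fixed-width ULEB128 encoding plus a decoder that reports the encoded length; the helper names are illustrative, not LLVM's support APIs.

#include <cstdint>
#include <cstdio>
#include <vector>

// Encode Value as ULEB128. If PadTo is non-zero, emit continuation bytes
// until the encoding is exactly PadTo bytes wide, mirroring the fixed-width
// section-size fields in wasm binaries.
static unsigned writeULEB128(uint64_t Value, std::vector<uint8_t> &Out,
                             unsigned PadTo = 0) {
  unsigned Count = 0;
  do {
    uint8_t Byte = Value & 0x7f;
    Value >>= 7;
    if (Value != 0 || Count + 1 < PadTo)
      Byte |= 0x80; // mark that more bytes follow
    Out.push_back(Byte);
    ++Count;
  } while (Value != 0 || Count < PadTo);
  return Count;
}

// Decode a ULEB128 and report how many bytes the encoding used, which is
// what a rewriter needs in order to reproduce the original padding.
static uint64_t readULEB128(const uint8_t *P, unsigned &Len) {
  uint64_t Value = 0;
  unsigned Shift = 0;
  Len = 0;
  uint8_t Byte;
  do {
    Byte = P[Len++];
    Value |= uint64_t(Byte & 0x7f) << Shift;
    Shift += 7;
  } while (Byte & 0x80);
  return Value;
}

int main() {
  std::vector<uint8_t> Buf;
  writeULEB128(3, Buf, 5); // 0x83 0x80 0x80 0x80 0x00: five bytes for "3"
  unsigned Len = 0;
  uint64_t Value = readULEB128(Buf.data(), Len);
  std::printf("value=%llu encoded-length=%u\n", (unsigned long long)Value, Len);
  return 0;
}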
+ const uint8_t *PreSizePtr = Ctx.Ptr; uint32_t Size = readVaruint32(Ctx); + Section.HeaderSecSizeEncodingLen = Ctx.Ptr - PreSizePtr; if (Size == 0) return make_error("zero length section", object_error::parse_failed); diff --git a/llvm/lib/ObjectYAML/WasmEmitter.cpp b/llvm/lib/ObjectYAML/WasmEmitter.cpp index 6230312eff7b..9b8fd11f8543 100644 --- a/llvm/lib/ObjectYAML/WasmEmitter.cpp +++ b/llvm/lib/ObjectYAML/WasmEmitter.cpp @@ -646,8 +646,18 @@ bool WasmWriter::writeWasm(raw_ostream &OS) { StringStream.flush(); + unsigned HeaderSecSizeEncodingLen = + Sec->HeaderSecSizeEncodingLen ? *Sec->HeaderSecSizeEncodingLen : 5; + unsigned RequiredLen = getULEB128Size(OutString.size()); + // Wasm spec does not allow LEBs larger than 5 bytes + assert(RequiredLen <= 5); + if (HeaderSecSizeEncodingLen < RequiredLen) { + reportError("section header length can't be encoded in a LEB of size " + + Twine(HeaderSecSizeEncodingLen)); + return false; + } // Write the section size followed by the content - encodeULEB128(OutString.size(), OS); + encodeULEB128(OutString.size(), OS, HeaderSecSizeEncodingLen); OS << OutString; } diff --git a/llvm/lib/ObjectYAML/WasmYAML.cpp b/llvm/lib/ObjectYAML/WasmYAML.cpp index 7ca422487df2..ef47766a2394 100644 --- a/llvm/lib/ObjectYAML/WasmYAML.cpp +++ b/llvm/lib/ObjectYAML/WasmYAML.cpp @@ -45,6 +45,7 @@ void MappingTraits::mapping(IO &IO, static void commonSectionMapping(IO &IO, WasmYAML::Section &Section) { IO.mapRequired("Type", Section.Type); IO.mapOptional("Relocations", Section.Relocations); + IO.mapOptional("HeaderSecSizeEncodingLen", Section.HeaderSecSizeEncodingLen); } static void sectionMapping(IO &IO, WasmYAML::DylinkSection &Section) { diff --git a/llvm/lib/Option/ArgList.cpp b/llvm/lib/Option/ArgList.cpp index 400bedabc003..86f28e578e5d 100644 --- a/llvm/lib/Option/ArgList.cpp +++ b/llvm/lib/Option/ArgList.cpp @@ -75,6 +75,13 @@ bool ArgList::hasFlag(OptSpecifier Pos, OptSpecifier Neg, bool Default) const { return Default; } +bool ArgList::hasFlagNoClaim(OptSpecifier Pos, OptSpecifier Neg, + bool Default) const { + if (Arg *A = getLastArgNoClaim(Pos, Neg)) + return A->getOption().matches(Pos); + return Default; +} + bool ArgList::hasFlag(OptSpecifier Pos, OptSpecifier PosAlias, OptSpecifier Neg, bool Default) const { if (Arg *A = getLastArg(Pos, PosAlias, Neg)) diff --git a/llvm/lib/TableGen/TGParser.cpp b/llvm/lib/TableGen/TGParser.cpp index 759e15f4c443..a371bd21f026 100644 --- a/llvm/lib/TableGen/TGParser.cpp +++ b/llvm/lib/TableGen/TGParser.cpp @@ -593,10 +593,11 @@ bool TGParser::resolveArguments(Record *Rec, ArrayRef ArgValues, for (auto *UnsolvedArgName : UnsolvedArgNames) { Init *Default = Rec->getValue(UnsolvedArgName)->getValue(); if (!Default->isComplete()) { - return Error(Loc, "value not specified for template argument (" + - UnsolvedArgName->getAsUnquotedString() + - ") of multiclass '" + Rec->getNameInitAsString() + - "'"); + std::string Name = UnsolvedArgName->getAsUnquotedString(); + Error(Loc, "value not specified for template argument '" + Name + "'"); + PrintNote(Rec->getFieldLoc(Name), + "declared in '" + Rec->getNameInitAsString() + "'"); + return true; } ArgValueHandler(UnsolvedArgName, Default); } diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 05adbe27c948..8f50af4b71fd 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -148,6 +148,9 @@ def FeatureExperimentalZeroingPseudos def FeatureUseScalarIncVL : SubtargetFeature<"use-scalar-inc-vl", "UseScalarIncVL", 
"true", "Prefer inc/dec over add+cnt">; +def FeatureNoSVEFPLD1R : SubtargetFeature<"no-sve-fp-ld1r", + "NoSVEFPLD1R", "true", "Avoid using LD1RX instructions for FP">; + def FeatureSVE2 : SubtargetFeature<"sve2", "HasSVE2", "true", "Enable Scalable Vector Extension 2 (SVE2) instructions (FEAT_SVE2)", [FeatureSVE, FeatureUseScalarIncVL]>; @@ -1137,7 +1140,8 @@ def TuneNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily", "NeoverseV1 FeatureLSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, - FeaturePredictableSelectIsExpensive]>; + FeaturePredictableSelectIsExpensive, + FeatureNoSVEFPLD1R]>; def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2", "Neoverse V2 ARM processors", [ diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index d66800664c0c..4d5676f34101 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1908,6 +1908,7 @@ static void InsertReturnAddressAuth(MachineFunction &MF, MachineBasicBlock &MBB, return; const AArch64Subtarget &Subtarget = MF.getSubtarget(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + bool EmitAsyncCFI = MFI.needsAsyncDwarfUnwindInfo(MF); MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); DebugLoc DL; @@ -1933,11 +1934,13 @@ static void InsertReturnAddressAuth(MachineFunction &MF, MachineBasicBlock &MBB, TII->get(MFI.shouldSignWithBKey() ? AArch64::AUTIBSP : AArch64::AUTIASP)) .setMIFlag(MachineInstr::FrameDestroy); - unsigned CFIIndex = - MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameDestroy); + if (EmitAsyncCFI) { + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameDestroy); + } if (NeedsWinCFI) { *HasWinCFI = true; BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PACSignLR)) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 13df87af6c7b..0605dfa63793 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1091,6 +1091,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, if (Subtarget->hasFullFP16()) { setOperationAction(ISD::ConstantFP, MVT::f16, Legal); + setOperationAction(ISD::ConstantFP, MVT::bf16, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom); @@ -9757,7 +9758,7 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero(); else if (VT == MVT::f32) IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero(); - else if (VT == MVT::f16) + else if (VT == MVT::f16 || VT == MVT::bf16) IsLegal = (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) || Imm.isPosZero(); @@ -15479,15 +15480,15 @@ bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL, if (!AM.HasBaseReg) return false; - // FIXME: Update this method to support scalable addressing modes. 
- if (Ty->isScalableTargetExtTy()) - return AM.HasBaseReg && !AM.BaseOffs && !AM.Scale; + if (Ty->isScalableTy()) { + if (isa(Ty)) { + uint64_t VecElemNumBytes = + DL.getTypeSizeInBits(cast(Ty)->getElementType()) / 8; + return AM.HasBaseReg && !AM.BaseOffs && + (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes); + } - if (isa(Ty)) { - uint64_t VecElemNumBytes = - DL.getTypeSizeInBits(cast(Ty)->getElementType()) / 8; - return AM.HasBaseReg && !AM.BaseOffs && - (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes); + return AM.HasBaseReg && !AM.BaseOffs && !AM.Scale; } // check reg + imm case: diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index cd2b9df27a24..39135df285c2 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -1306,6 +1306,11 @@ def fpimm16 : Operand, let PrintMethod = "printFPImmOperand"; } +def fpimmbf16 : Operand, + FPImmLeaf; + def fpimm32 : Operand, FPImmLeaf opc, string asm, string rhs_kind, RegisterOperand RegType, ValueType AccumType, ValueType InputType, SDPatternOperator OpNode> : - BaseSIMDIndexedTied { // idx = H:L:M bits<3> idx; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 9d901fd70446..30bd580ad86a 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -2228,7 +2228,6 @@ bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) { case AArch64::LDRWpre: case AArch64::LDURXi: case AArch64::LDRXpre: - case AArch64::LDRSWpre: case AArch64::LDURSWi: case AArch64::LDURHHi: case AArch64::LDURBBi: @@ -2438,7 +2437,6 @@ bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) { case AArch64::LDURXi: case AArch64::LDRXpre: case AArch64::LDURSWi: - case AArch64::LDRSWpre: return true; } } @@ -2559,8 +2557,7 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const { // Can't merge/pair if the instruction modifies the base register. // e.g., ldr x0, [x0] // This case will never occur with an FI base. - // However, if the instruction is an LDRpre or - // STRpre, it can be merged. + // However, if the instruction is an LDR/STRpre, it can be merged. // For example: // ldr q0, [x11, #32]! 
// ldr q1, [x11, #16] @@ -3137,7 +3134,6 @@ int AArch64InstrInfo::getMemScale(unsigned Opc) { case AArch64::LDRSpre: case AArch64::LDRSWui: case AArch64::LDURSWi: - case AArch64::LDRSWpre: case AArch64::LDRWpre: case AArch64::LDRWui: case AArch64::LDURWi: @@ -3193,7 +3189,6 @@ bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) { return false; case AArch64::LDRWpre: case AArch64::LDRXpre: - case AArch64::LDRSWpre: case AArch64::LDRSpre: case AArch64::LDRDpre: case AArch64::LDRQpre: @@ -5438,8 +5433,8 @@ static bool getFNEGPatterns(MachineInstr &Root, auto Match = [&](unsigned Opcode, MachineCombinerPattern Pattern) -> bool { MachineOperand &MO = Root.getOperand(1); MachineInstr *MI = MRI.getUniqueVRegDef(MO.getReg()); - if (MI != nullptr && MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) && - (MI->getOpcode() == Opcode) && + if (MI != nullptr && (MI->getOpcode() == Opcode) && + MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) && Root.getFlag(MachineInstr::MIFlag::FmContract) && Root.getFlag(MachineInstr::MIFlag::FmNsz) && MI->getFlag(MachineInstr::MIFlag::FmContract) && diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 3450ed29d142..9e72d37880c5 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -262,6 +262,8 @@ def UseNegativeImmediates def UseScalarIncVL : Predicate<"Subtarget->useScalarIncVL()">; +def UseSVEFPLD1R : Predicate<"!Subtarget->noSVEFPLD1R()">; + def IsNeonAvailable : Predicate<"Subtarget->isNeonAvailable()">; def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER", @@ -2251,7 +2253,7 @@ def : Pat<(int_aarch64_irg_sp i64:$Rm), (IRGstack SP, i64:$Rm)>; // Large STG to be expanded into a loop. $sz is the size, $Rn is start address. // $Rn_wback is one past the end of the range. $Rm is the loop counter. -let isCodeGenOnly=1, mayStore=1 in { +let isCodeGenOnly=1, mayStore=1, Defs=[NZCV] in { def STGloop_wback : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn_wback), (ins i64imm:$sz, GPR64sp:$Rn), [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,@earlyclobber $Rm" >, @@ -4355,16 +4357,23 @@ def FMOVS0 : Pseudo<(outs FPR32:$Rd), (ins), [(set f32:$Rd, (fpimm0))]>, def FMOVD0 : Pseudo<(outs FPR64:$Rd), (ins), [(set f64:$Rd, (fpimm0))]>, Sched<[WriteF]>; } + // Similarly add aliases def : InstAlias<"fmov $Rd, #0.0", (FMOVWHr FPR16:$Rd, WZR), 0>, Requires<[HasFullFP16]>; def : InstAlias<"fmov $Rd, #0.0", (FMOVWSr FPR32:$Rd, WZR), 0>; def : InstAlias<"fmov $Rd, #0.0", (FMOVXDr FPR64:$Rd, XZR), 0>; -// Pattern for FP16 immediates +def : Pat<(bf16 fpimm0), + (FMOVH0)>; + +// Pattern for FP16 and BF16 immediates let Predicates = [HasFullFP16] in { def : Pat<(f16 fpimm:$in), - (FMOVWHr (MOVi32imm (bitcast_fpimm_to_i32 f16:$in)))>; + (FMOVWHr (MOVi32imm (bitcast_fpimm_to_i32 f16:$in)))>; + + def : Pat<(bf16 fpimm:$in), + (FMOVWHr (MOVi32imm (bitcast_fpimm_to_i32 bf16:$in)))>; } //===----------------------------------------------------------------------===// @@ -4617,6 +4626,11 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1 in { defm FMOV : FPMoveImmediate<"fmov">; } +let Predicates = [HasFullFP16] in { + def : Pat<(bf16 fpimmbf16:$in), + (FMOVHi (fpimm16XForm bf16:$in))>; +} + //===----------------------------------------------------------------------===// // Advanced SIMD two vector instructions. 
//===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 419b471db3a3..41af5522d967 100644 --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -293,8 +293,6 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc, return AArch64::LDRWui; case AArch64::LDURSWi: return AArch64::LDURWi; - case AArch64::LDRSWpre: - return AArch64::LDRWpre; } } @@ -374,8 +372,6 @@ static unsigned getMatchingPairOpcode(unsigned Opc) { case AArch64::LDRSWui: case AArch64::LDURSWi: return AArch64::LDPSWi; - case AArch64::LDRSWpre: - return AArch64::LDPSWpre; } } @@ -589,8 +585,6 @@ static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) { return (OpcB == AArch64::LDRWui) || (OpcB == AArch64::LDURWi); case AArch64::LDRXpre: return (OpcB == AArch64::LDRXui) || (OpcB == AArch64::LDURXi); - case AArch64::LDRSWpre: - return (OpcB == AArch64::LDRSWui) || (OpcB == AArch64::LDURSWi); } } @@ -1346,7 +1340,7 @@ static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI, return false; // The STRpre - STRui and - // LDRpre-LDRui + // LDRpre-LDRui // are candidate pairs that can be merged. if (isPreLdStPairCandidate(FirstMI, MI)) return true; diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index ad404e8dab2a..b4f02e0dd203 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -204,10 +204,18 @@ def AArch64umax_p : SDNode<"AArch64ISD::UMAX_PRED", SDT_AArch64Arith>; def AArch64umin_p : SDNode<"AArch64ISD::UMIN_PRED", SDT_AArch64Arith>; def AArch64umulh_p : SDNode<"AArch64ISD::MULHU_PRED", SDT_AArch64Arith>; +def AArch64fadd_p_contract : PatFrag<(ops node:$op1, node:$op2, node:$op3), + (AArch64fadd_p node:$op1, node:$op2, node:$op3), [{ + return N->getFlags().hasAllowContract(); +}]>; def AArch64fadd_p_nsz : PatFrag<(ops node:$op1, node:$op2, node:$op3), (AArch64fadd_p node:$op1, node:$op2, node:$op3), [{ return N->getFlags().hasNoSignedZeros(); }]>; +def AArch64fsub_p_contract : PatFrag<(ops node:$op1, node:$op2, node:$op3), + (AArch64fsub_p node:$op1, node:$op2, node:$op3), [{ + return N->getFlags().hasAllowContract(); +}]>; def AArch64fsub_p_nsz : PatFrag<(ops node:$op1, node:$op2, node:$op3), (AArch64fsub_p node:$op1, node:$op2, node:$op3), [{ return N->getFlags().hasNoSignedZeros(); @@ -363,14 +371,12 @@ def AArch64fabd_p : PatFrags<(ops node:$pg, node:$op1, node:$op2), (AArch64fabs_mt node:$pg, (AArch64fsub_p node:$pg, node:$op1, node:$op2), undef)]>; def AArch64fmla_p : PatFrags<(ops node:$pg, node:$za, node:$zn, node:$zm), - [(AArch64fma_p node:$pg, node:$zn, node:$zm, node:$za), - (vselect node:$pg, (AArch64fma_p (AArch64ptrue 31), node:$zn, node:$zm, node:$za), node:$za)]>; + [(AArch64fma_p node:$pg, node:$zn, node:$zm, node:$za)]>; def AArch64fmls_p : PatFrags<(ops node:$pg, node:$za, node:$zn, node:$zm), [(int_aarch64_sve_fmls_u node:$pg, node:$za, node:$zn, node:$zm), (AArch64fma_p node:$pg, (AArch64fneg_mt node:$pg, node:$zn, (undef)), node:$zm, node:$za), - (AArch64fma_p node:$pg, node:$zm, (AArch64fneg_mt node:$pg, node:$zn, (undef)), node:$za), - (vselect node:$pg, (AArch64fma_p (AArch64ptrue 31), (AArch64fneg_mt (AArch64ptrue 31), node:$zn, (undef)), node:$zm, node:$za), node:$za)]>; + (AArch64fma_p node:$pg, node:$zm, 
(AArch64fneg_mt node:$pg, node:$zn, (undef)), node:$za)]>; def AArch64fnmla_p : PatFrags<(ops node:$pg, node:$za, node:$zn, node:$zm), [(int_aarch64_sve_fnmla_u node:$pg, node:$za, node:$zn, node:$zm), @@ -423,18 +429,15 @@ def AArch64eor3 : PatFrags<(ops node:$op1, node:$op2, node:$op3), [(int_aarch64_sve_eor3 node:$op1, node:$op2, node:$op3), (xor node:$op1, (xor node:$op2, node:$op3))]>; -class fma_patfrags - : PatFrags<(ops node:$pred, node:$op1, node:$op2, node:$op3), - [(intrinsic node:$pred, node:$op1, node:$op2, node:$op3), - (vselect node:$pred, (add (SVEAllActive), node:$op1, (AArch64fmul_p_oneuse (SVEAllActive), node:$op2, node:$op3)), node:$op1)], -[{ - if (N->getOpcode() == ISD::VSELECT) - return N->getOperand(1)->getFlags().hasAllowContract(); - return true; // it's the intrinsic -}]>; +def AArch64fmla_m1 : PatFrags<(ops node:$pg, node:$za, node:$zn, node:$zm), + [(int_aarch64_sve_fmla node:$pg, node:$za, node:$zn, node:$zm), + (vselect node:$pg, (AArch64fadd_p_contract (SVEAllActive), node:$za, (AArch64fmul_p_oneuse (SVEAllActive), node:$zn, node:$zm)), node:$za), + (vselect node:$pg, (AArch64fma_p (SVEAllActive), node:$zn, node:$zm, node:$za), node:$za)]>; -def AArch64fmla_m1 : fma_patfrags; -def AArch64fmls_m1 : fma_patfrags; +def AArch64fmls_m1 : PatFrags<(ops node:$pg, node:$za, node:$zn, node:$zm), + [(int_aarch64_sve_fmls node:$pg, node:$za, node:$zn, node:$zm), + (vselect node:$pg, (AArch64fsub_p_contract (SVEAllActive), node:$za, (AArch64fmul_p_oneuse (SVEAllActive), node:$zn, node:$zm)), node:$za), + (vselect node:$pg, (AArch64fma_p (SVEAllActive), (AArch64fneg_mt (SVEAllActive), node:$zn, (undef)), node:$zm, node:$za), node:$za)]>; def AArch64add_m1 : VSelectUnpredOrPassthruPatFrags; def AArch64sub_m1 : VSelectUnpredOrPassthruPatFrags; @@ -2352,13 +2355,15 @@ let Predicates = [HasSVEorSME] in { // LDR1 of 64-bit data defm : LD1RPat; - // LD1R of FP data - defm : LD1RPat; - defm : LD1RPat; - defm : LD1RPat; - defm : LD1RPat; - defm : LD1RPat; - defm : LD1RPat; + let Predicates = [HasSVEorSME, UseSVEFPLD1R] in { + // LD1R of FP data + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + defm : LD1RPat; + } // LD1R of 128-bit masked data multiclass ld1rq_pat{ diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 9ab86684856e..5e20d16464c4 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -394,7 +394,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { bool useSVEForFixedLengthVectors() const { if (!isNeonAvailable()) - return true; + return hasSVE(); // Prefer NEON unless larger SVE registers are available. return hasSVE() && getMinSVEVectorSizeInBits() >= 256; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp index a66d2ddee652..e78d8bb487a9 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -564,6 +564,11 @@ void AArch64CallLowering::saveVarArgRegisters( if (IsWin64CC) { GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -static_cast(GPRSaveSize), false); + if (GPRSaveSize & 15) + // The extra size here, if triggered, will always be 8. 
+ MFI.CreateFixedObject(16 - (GPRSaveSize & 15), + -static_cast(alignTo(GPRSaveSize, 16)), + false); } else GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false); diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 118862b8c317..4902ec3639ec 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -2317,7 +2317,10 @@ multiclass sve_fp_3op_p_zds_a opc, string asm, string Ps, SVEPseudo2Instr, SVEInstr2Rev; def : SVE_4_Op_Pat(NAME # _H)>; + def : SVE_4_Op_Pat(NAME # _H)>; + def : SVE_4_Op_Pat(NAME # _H)>; def : SVE_4_Op_Pat(NAME # _S)>; + def : SVE_4_Op_Pat(NAME # _S)>; def : SVE_4_Op_Pat(NAME # _D)>; } @@ -7200,6 +7203,10 @@ multiclass sve_int_perm_cpy_v { def : Pat<(nxv8f16 (op nxv8i1:$pg, f16:$splat, nxv8f16:$passthru)), (!cast(NAME # _H) $passthru, $pg, $splat)>; + def : Pat<(nxv4f16 (op nxv4i1:$pg, f16:$splat, nxv4f16:$passthru)), + (!cast(NAME # _H) $passthru, $pg, $splat)>; + def : Pat<(nxv2f16 (op nxv2i1:$pg, f16:$splat, nxv2f16:$passthru)), + (!cast(NAME # _H) $passthru, $pg, $splat)>; def : Pat<(nxv2f32 (op nxv2i1:$pg, f32:$splat, nxv2f32:$passthru)), (!cast(NAME # _S) $passthru, $pg, $splat)>; def : Pat<(nxv4f32 (op nxv4i1:$pg, f32:$splat, nxv4f32:$passthru)), diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index b82db82de84e..c25194c02f72 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -41,7 +41,6 @@ FunctionPass *createSIFixControlFlowLiveIntervalsPass(); FunctionPass *createSIOptimizeExecMaskingPreRAPass(); FunctionPass *createSIOptimizeVGPRLiveRangePass(); FunctionPass *createSIFixSGPRCopiesPass(); -FunctionPass *createLowerWWMCopiesPass(); FunctionPass *createSIMemoryLegalizerPass(); FunctionPass *createSIInsertWaitcntsPass(); FunctionPass *createSIPreAllocateWWMRegsPass(); @@ -145,9 +144,6 @@ extern char &SIFixSGPRCopiesID; void initializeSIFixVGPRCopiesPass(PassRegistry &); extern char &SIFixVGPRCopiesID; -void initializeSILowerWWMCopiesPass(PassRegistry &); -extern char &SILowerWWMCopiesID; - void initializeSILowerI1CopiesPass(PassRegistry &); extern char &SILowerI1CopiesID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 254d02d4ce5b..39e00a037bdd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -2528,7 +2528,7 @@ SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src, std::pair AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, SDValue Src, SDNodeFlags Flags) const { - if (allowApproxFunc(DAG, Flags) || !needsDenormHandlingF32(DAG, Src, Flags)) + if (!needsDenormHandlingF32(DAG, Src, Flags)) return {}; MVT VT = MVT::f32; @@ -2609,9 +2609,7 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags); } - SDValue Lowered = LowerFLOGUnsafe( - X, DL, DAG, IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2, - Flags); + SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags); if (VT == MVT::f16 && !Subtarget->has16BitInsts()) { return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered, DAG.getTargetConstant(0, DL, MVT::i32), Flags); @@ -2696,11 +2694,36 @@ SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const { // Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a // promote f16 operation. 
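The rewritten LowerFLOGUnsafe that follows folds the base conversion into the denormal-scaling path: when the f32 input had to be multiplied by 2^32 to leave the denormal range, the result is computed as log2(x * 2^32) * C - 32 * C, with C = log10(2) for log10 and ln(2) for the natural log, combined with an FMA when the subtarget has a fast one. A standalone numeric sketch of that identity; the denormal threshold and the use of std::log2 in place of the hardware log instruction are assumptions of the sketch.

#include <cmath>
#include <cstdio>

// log_b(x) built from a log2 primitive. Denormal inputs are first multiplied
// by 2^32 so log2 sees a normal value, and the known bias of 32 is folded
// into the final multiply-add together with the base-conversion constant C:
// log_b(x) = log2(x * 2^32) * C - 32 * C.
static float flogViaLog2(float X, bool IsLog10) {
  const float C = IsLog10 ? std::log10(2.0f) : std::log(2.0f);
  const bool NeedsScaling = X > 0.0f && X < 0x1.0p-126f; // below FLT_MIN
  const float In = NeedsScaling ? X * 0x1.0p32f : X;
  const float Log2 = std::log2(In); // stands in for the hardware log op
  const float Offset = NeedsScaling ? -32.0f * C : 0.0f;
  return std::fma(Log2, C, Offset);
}

int main() {
  std::printf("%g vs %g\n", flogViaLog2(1e-40f, true), std::log10(1e-40f));
  return 0;
}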
SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL, - SelectionDAG &DAG, - double Log2BaseInverted, + SelectionDAG &DAG, bool IsLog10, SDNodeFlags Flags) const { EVT VT = Src.getValueType(); unsigned LogOp = VT == MVT::f32 ? AMDGPUISD::LOG : ISD::FLOG2; + + double Log2BaseInverted = + IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2; + + if (VT == MVT::f32) { + auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags); + if (ScaledInput) { + SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags); + SDValue ScaledResultOffset = + DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT); + + SDValue Zero = DAG.getConstantFP(0.0f, SL, VT); + + SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled, + ScaledResultOffset, Zero, Flags); + + SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT); + + if (Subtarget->hasFastFMAF32()) + return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset, + Flags); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags); + return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset); + } + } + SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags); SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT); @@ -2728,7 +2751,7 @@ SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const { assert(VT == MVT::f32); - if (allowApproxFunc(DAG, Flags) || !needsDenormHandlingF32(DAG, Src, Flags)) + if (!needsDenormHandlingF32(DAG, Src, Flags)) return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags); // bool needs_scaling = x < -0x1.f80000p+6f; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 26b91155ba85..c39093b9bb6b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -72,7 +72,7 @@ class AMDGPUTargetLowering : public TargetLowering { SDValue LowerFLOGCommon(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFLOG10(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFLOGUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, - double Log2BaseInverted, SDNodeFlags Flags) const; + bool IsLog10, SDNodeFlags Flags) const; SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 120c00b14a36..bbf4db12f5ab 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -3037,8 +3037,7 @@ static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, std::pair AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const { - if (allowApproxFunc(B.getMF(), Flags) || - !needsDenormHandlingF32(B.getMF(), Src, Flags)) + if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) return {}; const LLT F32 = LLT::scalar(32); @@ -3132,16 +3131,13 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) || TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) { - const double Log2BaseInv = - IsLog10 ? 
numbers::ln2 / numbers::ln10 : numbers::ln2; - if (Ty == F16 && !ST.has16BitInsts()) { Register LogVal = MRI.createGenericVirtualRegister(F32); auto PromoteSrc = B.buildFPExt(F32, X); - legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), Log2BaseInv, Flags); + legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags); B.buildFPTrunc(Dst, LogVal); } else { - legalizeFlogUnsafe(B, Dst, X, Log2BaseInv, Flags); + legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags); } MI.eraseFromParent(); @@ -3225,10 +3221,36 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, } bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, - Register Src, - double Log2BaseInverted, + Register Src, bool IsLog10, unsigned Flags) const { + const double Log2BaseInverted = + IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2; + LLT Ty = B.getMRI()->getType(Dst); + + if (Ty == LLT::scalar(32)) { + auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags); + if (ScaledInput) { + auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false) + .addUse(Src) + .setMIFlags(Flags); + auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted); + auto Zero = B.buildFConstant(Ty, 0.0); + auto ResultOffset = + B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags); + auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted); + + if (ST.hasFastFMAF32()) + B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags); + else { + auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags); + B.buildFAdd(Dst, Mul, ResultOffset, Flags); + } + + return true; + } + } + auto Log2Operand = Ty == LLT::scalar(16) ? B.buildFLog2(Ty, Src, Flags) : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false) @@ -3264,11 +3286,10 @@ bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI, assert(Ty == F32); - if (allowApproxFunc(B.getMF(), Flags) || - !needsDenormHandlingF32(B.getMF(), Src, Flags)) { + if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) { B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef{Dst}, false) - .addUse(Src) - .setMIFlags(Flags); + .addUse(Src) + .setMIFlags(Flags); MI.eraseFromParent(); return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index 04773f275c87..534bb2c87ea3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -85,7 +85,7 @@ class AMDGPULegalizerInfo final : public LegalizerInfo { bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const; bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const; bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, - double Log2BaseInverted, unsigned Flags) const; + bool IsLog10, unsigned Flags) const; bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const; bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 1d69f0434b58..17025867c1da 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -386,6 +386,8 @@ static Value *promoteAllocaUserToVector( }; Type *VecEltTy = VectorTy->getElementType(); + const unsigned NumVecElts = VectorTy->getNumElements(); + switch (Inst->getOpcode()) { case Instruction::Load: { // Loads can only be lowered if the value is known. @@ -413,13 +415,13 @@ static Value *promoteAllocaUserToVector( // Loading a subvector. 
if (isa(AccessTy)) { assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy))); - const unsigned NumElts = AccessSize / DL.getTypeStoreSize(VecEltTy); - auto *SubVecTy = FixedVectorType::get(VecEltTy, NumElts); + const unsigned NumLoadedElts = AccessSize / DL.getTypeStoreSize(VecEltTy); + auto *SubVecTy = FixedVectorType::get(VecEltTy, NumLoadedElts); assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy)); unsigned IndexVal = cast(Index)->getZExtValue(); Value *SubVec = PoisonValue::get(SubVecTy); - for (unsigned K = 0; K < NumElts; ++K) { + for (unsigned K = 0; K < NumLoadedElts; ++K) { SubVec = Builder.CreateInsertElement( SubVec, Builder.CreateExtractElement(CurVal, IndexVal + K), K); } @@ -465,8 +467,9 @@ static Value *promoteAllocaUserToVector( // Storing a subvector. if (isa(AccessTy)) { assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy))); - const unsigned NumElts = AccessSize / DL.getTypeStoreSize(VecEltTy); - auto *SubVecTy = FixedVectorType::get(VecEltTy, NumElts); + const unsigned NumWrittenElts = + AccessSize / DL.getTypeStoreSize(VecEltTy); + auto *SubVecTy = FixedVectorType::get(VecEltTy, NumWrittenElts); assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy)); if (SubVecTy->isPtrOrPtrVectorTy()) @@ -478,7 +481,8 @@ static Value *promoteAllocaUserToVector( unsigned IndexVal = cast(Index)->getZExtValue(); Value *CurVec = GetOrLoadCurrentVectorValue(); - for (unsigned K = 0; (IndexVal + K) < NumElts; ++K) { + for (unsigned K = 0; K < NumWrittenElts && ((IndexVal + K) < NumVecElts); + ++K) { CurVec = Builder.CreateInsertElement( CurVec, Builder.CreateExtractElement(Val, K), IndexVal + K); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index f90c8e4bdddd..87ef2333e2ea 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -364,7 +364,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUDAGToDAGISelPass(*PR); initializeGCNDPPCombinePass(*PR); initializeSILowerI1CopiesPass(*PR); - initializeSILowerWWMCopiesPass(*PR); initializeSILowerSGPRSpillsPass(*PR); initializeSIFixSGPRCopiesPass(*PR); initializeSIFixVGPRCopiesPass(*PR); @@ -1297,7 +1296,6 @@ void GCNPassConfig::addOptimizedRegAlloc() { } bool GCNPassConfig::addPreRewrite() { - addPass(&SILowerWWMCopiesID); if (EnableRegReassign) addPass(&GCNNSAReassignID); return true; @@ -1352,8 +1350,6 @@ bool GCNPassConfig::addRegAssignAndRewriteFast() { addPass(&SILowerSGPRSpillsID); addPass(createVGPRAllocPass(false)); - - addPass(&SILowerWWMCopiesID); return true; } diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 865caae240f3..903e726c667d 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -99,10 +99,10 @@ static void getVGPRSpillLaneOrTempRegister( SGPR, PrologEpilogSGPRSaveRestoreInfo( SGPRSaveKind::SPILL_TO_VGPR_LANE, FI)); - LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front(); - dbgs() << printReg(SGPR, TRI) << " requires fallback spill to " - << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane - << '\n';); + LLVM_DEBUG( + auto Spill = MFI->getPrologEpilogSGPRSpillToVGPRLanes(FI).front(); + dbgs() << printReg(SGPR, TRI) << " requires fallback spill to " + << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';); } else { // Remove dead index MF.getFrameInfo().RemoveStackObject(FI); 
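In the AMDGPUPromoteAlloca hunk above, the subvector-store loop is now bounded both by the number of elements being written and by the length of the promoted vector, so a store that starts near the end of the alloca can no longer write past the last element. A standalone sketch of that clamped insert over plain arrays.

#include <array>
#include <cstdio>

// Write the elements of Sub into Vec starting at Index, but never past the
// end of Vec: the loop is bounded by both the subvector length and the
// promoted vector length.
template <size_t N, size_t M>
static void insertSubVector(std::array<float, N> &Vec,
                            const std::array<float, M> &Sub, size_t Index) {
  for (size_t K = 0; K < M && Index + K < N; ++K)
    Vec[Index + K] = Sub[K];
}

int main() {
  std::array<float, 4> Vec{};
  std::array<float, 3> Sub{1, 2, 3};
  insertSubVector(Vec, Sub, 2); // only two slots remain; the third is dropped
  for (float F : Vec)
    std::printf("%g ", F); // 0 0 1 2
  std::printf("\n");
  return 0;
}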
@@ -264,7 +264,7 @@ class PrologEpilogSGPRSpillBuilder { assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); ArrayRef Spill = - FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI); + FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI); assert(Spill.size() == NumSubRegs); for (unsigned I = 0; I < NumSubRegs; ++I) { @@ -309,7 +309,7 @@ class PrologEpilogSGPRSpillBuilder { void restoreFromVGPRLane(const int FI) { assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); ArrayRef Spill = - FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI); + FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI); assert(Spill.size() == NumSubRegs); for (unsigned I = 0; I < NumSubRegs; ++I) { @@ -1353,8 +1353,8 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI, TRI->isAGPR(MRI, VReg))) { assert(RS != nullptr); - RS->enterBasicBlockEnd(MBB); - RS->backward(MI); + // FIXME: change to enterBasicBlockEnd() + RS->enterBasicBlock(MBB); TRI->eliminateFrameIndex(MI, 0, FIOp, RS); SpillFIs.set(FI); continue; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 3148f49ff0d5..b7b90e23e895 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -278,10 +278,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, case ISD::UNDEF: case ISD::EXTRACT_VECTOR_ELT: case ISD::INSERT_VECTOR_ELT: - case ISD::EXTRACT_SUBVECTOR: case ISD::SCALAR_TO_VECTOR: case ISD::IS_FPCLASS: break; + case ISD::EXTRACT_SUBVECTOR: case ISD::INSERT_SUBVECTOR: case ISD::CONCAT_VECTORS: setOperationAction(Op, VT, Custom); diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 4b0283b27a6f..a74b917f82bf 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -382,6 +382,8 @@ class SIInsertWaitcnts : public MachineFunctionPass { bool ForceEmitZeroWaitcnts; bool ForceEmitWaitcnt[NUM_INST_CNTS]; + bool OptNone; + // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS // message. DenseSet ReleaseVGPRInsts; @@ -1040,7 +1042,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // do this if there are no outstanding scratch stores. 
else if (MI.getOpcode() == AMDGPU::S_ENDPGM || MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) { - if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && + if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !OptNone && ScoreBrackets.getScoreRange(VS_CNT) != 0 && !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS)) ReleaseVGPRInsts.insert(&MI); @@ -1822,6 +1824,9 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { for (auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; + OptNone = MF.getFunction().hasOptNone() || + MF.getTarget().getOptLevel() == CodeGenOpt::None; + HardwareLimits Limits = {}; Limits.VmcntMax = AMDGPU::getVmcntBitMask(IV); Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 278cf2b69ee3..0f954732a5ee 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2414,14 +2414,6 @@ SIInstrInfo::expandMovDPP64(MachineInstr &MI) const { return std::pair(Split[0], Split[1]); } -std::optional -SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const { - if (MI.getOpcode() == AMDGPU::WWM_COPY) - return DestSourcePair{MI.getOperand(0), MI.getOperand(1)}; - - return std::nullopt; -} - bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, unsigned Src0OpName, @@ -3088,7 +3080,6 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) { case AMDGPU::S_MOV_B32: case AMDGPU::S_MOV_B64: case AMDGPU::COPY: - case AMDGPU::WWM_COPY: case AMDGPU::V_ACCVGPR_WRITE_B32_e64: case AMDGPU::V_ACCVGPR_READ_B32_e64: case AMDGPU::V_ACCVGPR_MOV_B32: @@ -4978,8 +4969,7 @@ void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, - bool IsSCCLive, - SlotIndexes *Indexes) const { + bool IsSCCLive) const { const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); bool IsWave32 = ST.isWave32(); @@ -4989,34 +4979,23 @@ void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF, // the single instruction S_OR_SAVEEXEC that clobbers SCC. unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg) - .addReg(Exec, RegState::Kill); - auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1); - if (Indexes) { - Indexes->insertMachineInstrInMaps(*StoreExecMI); - Indexes->insertMachineInstrInMaps(*FlipExecMI); - } + BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg).addReg(Exec, RegState::Kill); + BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1); } else { const unsigned OrSaveExec = IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64; auto SaveExec = BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1); SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead. - if (Indexes) - Indexes->insertMachineInstrInMaps(*SaveExec); } } void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, Register Reg, - SlotIndexes *Indexes) const { + const DebugLoc &DL, Register Reg) const { unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; MCRegister Exec = isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; - auto ExecRestoreMI = - BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill); - if (Indexes) - Indexes->insertMachineInstrInMaps(*ExecRestoreMI); + BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill); } static const TargetRegisterClass * @@ -8001,16 +7980,6 @@ SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const { return ArrayRef(TargetFlags); } -unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg, - const MachineFunction &MF) const { - const SIMachineFunctionInfo *MFI = MF.getInfo(); - assert(SrcReg.isVirtual()); - if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG)) - return AMDGPU::WWM_COPY; - - return AMDGPU::COPY; -} - bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const { return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY && MI.modifiesRegister(AMDGPU::EXEC, &RI); @@ -8562,7 +8531,7 @@ MachineInstr *SIInstrInfo::foldMemoryOperandImpl( // A similar issue also exists with spilling and reloading $exec registers. // // To prevent that, constrain the %0 register class here. - if (isFullCopyInstr(MI)) { + if (MI.isFullCopy()) { Register DstReg = MI.getOperand(0).getReg(); Register SrcReg = MI.getOperand(1).getReg(); if ((DstReg.isVirtual() || SrcReg.isVirtual()) && @@ -8659,7 +8628,7 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const { if (opcode == AMDGPU::V_READLANE_B32 || opcode == AMDGPU::V_READFIRSTLANE_B32) return InstructionUniformity::AlwaysUniform; - if (isCopyInstr(MI)) { + if (MI.isCopy()) { const MachineOperand &srcOp = MI.getOperand(1); if (srcOp.isReg() && srcOp.getReg().isPhysical()) { const TargetRegisterClass *regClass = diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index b25aae7b2fb0..66f93e5640d6 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -170,12 +170,6 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { Register findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const; protected: - /// If the specific machine instruction is a instruction that moves/copies - /// value from one register to another register return destination and source - /// registers as machine operands. 
- std::optional - isCopyInstrImpl(const MachineInstr &MI) const override; - bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, unsigned Src0OpName, MachineOperand &Src1, unsigned Src1OpName) const; @@ -833,7 +827,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { } bool isVGPRCopy(const MachineInstr &MI) const { - assert(isCopyInstr(MI)); + assert(MI.isCopy()); Register Dest = MI.getOperand(0).getReg(); const MachineFunction &MF = *MI.getParent()->getParent(); const MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -903,7 +897,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { if (OpIdx >= MI.getDesc().NumOperands) return false; - if (isCopyInstr(MI)) { + if (MI.isCopy()) { unsigned Size = getOpSize(MI, OpIdx); assert(Size == 8 || Size == 4); @@ -952,12 +946,12 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, Register Reg, bool IsSCCLive, - SlotIndexes *Indexes = nullptr) const; + const DebugLoc &DL, Register Reg, + bool IsSCCLive) const; void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, - Register Reg, SlotIndexes *Indexes = nullptr) const; + Register Reg) const; /// Return the correct register class for \p OpNo. For target-specific /// instructions, this will return the register class that has been defined @@ -1149,9 +1143,6 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override; - unsigned getLiveRangeSplitOpcode(Register Reg, - const MachineFunction &MF) const override; - bool isBasicBlockPrologue(const MachineInstr &MI) const override; MachineInstr *createPHIDestinationCopy(MachineBasicBlock &MBB, diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 7fe76b4c13ca..2edebccef7d8 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -172,13 +172,6 @@ def STRICT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] -def WWM_COPY : SPseudoInstSI < - (outs unknown:$dst), (ins unknown:$src)> { - let hasSideEffects = 0; - let isAsCheapAsAMove = 1; - let isConvergent = 1; -} - def ENTER_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> { let Uses = [EXEC]; let Defs = [EXEC, SCC]; diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 47d28d5d0eab..d21107c02ef7 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -50,9 +50,7 @@ class SILowerSGPRSpills : public MachineFunctionPass { SILowerSGPRSpills() : MachineFunctionPass(ID) {} void calculateSaveRestoreBlocks(MachineFunction &MF); - bool spillCalleeSavedRegs(MachineFunction &MF, - SmallVectorImpl &CalleeSavedFIs); - void extendWWMVirtRegLiveness(MachineFunction &MF, LiveIntervals *LIS); + bool spillCalleeSavedRegs(MachineFunction &MF); bool runOnMachineFunction(MachineFunction &MF) override; @@ -60,13 +58,6 @@ class SILowerSGPRSpills : public MachineFunctionPass { AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); } - - MachineFunctionProperties getClearedProperties() const override { - // SILowerSGPRSpills introduces new Virtual VGPRs for spilling SGPRs. 
- return MachineFunctionProperties() - .set(MachineFunctionProperties::Property::IsSSA) - .set(MachineFunctionProperties::Property::NoVRegs); - } }; } // end anonymous namespace @@ -206,8 +197,7 @@ static void updateLiveness(MachineFunction &MF, ArrayRef CSI) { EntryBB.sortUniqueLiveIns(); } -bool SILowerSGPRSpills::spillCalleeSavedRegs( - MachineFunction &MF, SmallVectorImpl &CalleeSavedFIs) { +bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); const Function &F = MF.getFunction(); const GCNSubtarget &ST = MF.getSubtarget(); @@ -238,7 +228,6 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs( TRI->getSpillAlign(*RC), true); CSI.push_back(CalleeSavedInfo(Reg, JunkFI)); - CalleeSavedFIs.push_back(JunkFI); } } @@ -259,50 +248,6 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs( return false; } -void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF, - LiveIntervals *LIS) { - // TODO: This is a workaround to avoid the unmodelled liveness computed with - // whole-wave virtual registers when allocated together with the regular VGPR - // virtual registers. Presently, the liveness computed during the regalloc is - // only uniform (or single lane aware) and it doesn't take account of the - // divergent control flow that exists for our GPUs. Since the WWM registers - // can modify inactive lanes, the wave-aware liveness should be computed for - // the virtual registers to accurately plot their interferences. Without - // having the divergent CFG for the function, it is difficult to implement the - // wave-aware liveness info. Until then, we conservatively extend the liveness - // of the wwm registers into the entire function so that they won't be reused - // without first spilling/splitting their liveranges. - SIMachineFunctionInfo *MFI = MF.getInfo(); - - // Insert the IMPLICIT_DEF for the wwm-registers in the entry blocks. - for (auto Reg : MFI->getSGPRSpillVGPRs()) { - for (MachineBasicBlock *SaveBlock : SaveBlocks) { - MachineBasicBlock::iterator InsertBefore = SaveBlock->begin(); - auto MIB = BuildMI(*SaveBlock, *InsertBefore, InsertBefore->getDebugLoc(), - TII->get(AMDGPU::IMPLICIT_DEF), Reg); - MFI->setFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG); - if (LIS) { - LIS->InsertMachineInstrInMaps(*MIB); - } - } - } - - // Insert the KILL in the return blocks to extend their liveness untill the - // end of function. Insert a separate KILL for each VGPR. - for (MachineBasicBlock *RestoreBlock : RestoreBlocks) { - MachineBasicBlock::iterator InsertBefore = - RestoreBlock->getFirstTerminator(); - for (auto Reg : MFI->getSGPRSpillVGPRs()) { - auto MIB = - BuildMI(*RestoreBlock, *InsertBefore, InsertBefore->getDebugLoc(), - TII->get(TargetOpcode::KILL)); - MIB.addReg(Reg); - if (LIS) - LIS->InsertMachineInstrInMaps(*MIB); - } - } -} - bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget(); TII = ST.getInstrInfo(); @@ -316,8 +261,7 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { // First, expose any CSR SGPR spills. This is mostly the same as what PEI // does, but somewhat simpler. 
calculateSaveRestoreBlocks(MF); - SmallVector CalleeSavedFIs; - bool HasCSRs = spillCalleeSavedRegs(MF, CalleeSavedFIs); + bool HasCSRs = spillCalleeSavedRegs(MF); MachineFrameInfo &MFI = MF.getFrameInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -331,7 +275,6 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { bool MadeChange = false; bool NewReservedRegs = false; - bool SpilledToVirtVGPRLanes = false; // TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be // handled as SpilledToReg in regular PrologEpilogInserter. @@ -354,53 +297,23 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex(); assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); - - bool IsCalleeSaveSGPRSpill = - std::find(CalleeSavedFIs.begin(), CalleeSavedFIs.end(), FI) != - CalleeSavedFIs.end(); - if (IsCalleeSaveSGPRSpill) { - // Spill callee-saved SGPRs into physical VGPR lanes. - - // TODO: This is to ensure the CFIs are static for efficient frame - // unwinding in the debugger. Spilling them into virtual VGPR lanes - // involve regalloc to allocate the physical VGPRs and that might - // cause intermediate spill/split of such liveranges for successful - // allocation. This would result in broken CFI encoding unless the - // regalloc aware CFI generation to insert new CFIs along with the - // intermediate spills is implemented. There is no such support - // currently exist in the LLVM compiler. - if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI, true)) { - NewReservedRegs = true; - bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex( - MI, FI, nullptr, Indexes, LIS, true); - if (!Spilled) - llvm_unreachable( - "failed to spill SGPR to physical VGPR lane when allocated"); - } - } else { - if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) { - bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex( - MI, FI, nullptr, Indexes, LIS); - if (!Spilled) - llvm_unreachable( - "failed to spill SGPR to virtual VGPR lane when allocated"); - SpillFIs.set(FI); - SpilledToVirtVGPRLanes = true; - } + if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) { + NewReservedRegs = true; + bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex( + MI, FI, nullptr, Indexes, LIS); + (void)Spilled; + assert(Spilled && "failed to spill SGPR to VGPR when allocated"); + SpillFIs.set(FI); } } } - if (SpilledToVirtVGPRLanes) { - extendWWMVirtRegLiveness(MF, LIS); - if (LIS) { - // Compute the LiveInterval for the newly created virtual registers. - for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) - LIS->createAndComputeVirtRegInterval(Reg); - } - } - + // FIXME: Adding to live-ins redundant with reserving registers. for (MachineBasicBlock &MBB : MF) { + for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) + MBB.addLiveIn(Reg); + MBB.sortUniqueLiveIns(); + // FIXME: The dead frame indices are replaced with a null register from // the debug value instructions. We should instead, update it with the // correct register value. But not sure the register value alone is @@ -421,10 +334,6 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { // lane". FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ false); - MadeChange = true; - } - - if (SpilledToVirtVGPRLanes) { const TargetRegisterClass *RC = TRI->getWaveMaskRegClass(); // Shift back the reserved SGPR for EXEC copy into the lowest range. 
// This SGPR is reserved to handle the whole-wave spill/copy operations @@ -433,21 +342,20 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { if (UnusedLowSGPR && TRI->getHWRegIndex(UnusedLowSGPR) < TRI->getHWRegIndex(FuncInfo->getSGPRForEXECCopy())) FuncInfo->setSGPRForEXECCopy(UnusedLowSGPR); + + MadeChange = true; } else { - // No SGPR spills to virtual VGPR lanes and hence there won't be any WWM - // spills/copies. Reset the SGPR reserved for EXEC copy. + // No SGPR spills and hence there won't be any WWM spills/copies. Reset the + // SGPR reserved for EXEC copy. FuncInfo->setSGPRForEXECCopy(AMDGPU::NoRegister); } SaveBlocks.clear(); RestoreBlocks.clear(); - // Updated the reserved registers with any physical VGPRs added for SGPR - // spills. - if (NewReservedRegs) { - for (Register Reg : FuncInfo->getWWMReservedRegs()) - MRI.reserveReg(Reg, TRI); - } + // Updated the reserved registers with any VGPRs added for SGPR spills. + if (NewReservedRegs) + MRI.freezeReservedRegs(MF); return MadeChange; } diff --git a/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp b/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp deleted file mode 100644 index 9c3cd1bbd6b0..000000000000 --- a/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp +++ /dev/null @@ -1,141 +0,0 @@ -//===-- SILowerWWMCopies.cpp - Lower Copies after regalloc ---===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file -/// Lowering the WWM_COPY instructions for various register classes. -/// AMDGPU target generates WWM_COPY instruction to differentiate WWM -/// copy from COPY. This pass generates the necessary exec mask manipulation -/// instructions to replicate 'Whole Wave Mode' and lowers WWM_COPY back to -/// COPY. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "GCNSubtarget.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/LiveIntervals.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/VirtRegMap.h" -#include "llvm/InitializePasses.h" - -using namespace llvm; - -#define DEBUG_TYPE "si-lower-wwm-copies" - -namespace { - -class SILowerWWMCopies : public MachineFunctionPass { -public: - static char ID; - - SILowerWWMCopies() : MachineFunctionPass(ID) { - initializeSILowerWWMCopiesPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - StringRef getPassName() const override { return "SI Lower WWM Copies"; } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesAll(); - MachineFunctionPass::getAnalysisUsage(AU); - } - -private: - bool isSCCLiveAtMI(const MachineInstr &MI); - void addToWWMSpills(MachineFunction &MF, Register Reg); - - LiveIntervals *LIS; - SlotIndexes *Indexes; - VirtRegMap *VRM; - const SIRegisterInfo *TRI; - const MachineRegisterInfo *MRI; - SIMachineFunctionInfo *MFI; -}; - -} // End anonymous namespace. 
- -INITIALIZE_PASS_BEGIN(SILowerWWMCopies, DEBUG_TYPE, "SI Lower WWM Copies", - false, false) -INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_DEPENDENCY(VirtRegMap) -INITIALIZE_PASS_END(SILowerWWMCopies, DEBUG_TYPE, "SI Lower WWM Copies", false, - false) - -char SILowerWWMCopies::ID = 0; - -char &llvm::SILowerWWMCopiesID = SILowerWWMCopies::ID; - -bool SILowerWWMCopies::isSCCLiveAtMI(const MachineInstr &MI) { - // We can't determine the liveness info if LIS isn't available. Early return - // in that case and always assume SCC is live. - if (!LIS) - return true; - - LiveRange &LR = - LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI)); - SlotIndex Idx = LIS->getInstructionIndex(MI); - return LR.liveAt(Idx); -} - -// If \p Reg is assigned with a physical VGPR, add the latter into wwm-spills -// for preserving its entire lanes at function prolog/epilog. -void SILowerWWMCopies::addToWWMSpills(MachineFunction &MF, Register Reg) { - if (Reg.isPhysical()) - return; - - Register PhysReg = VRM->getPhys(Reg); - assert(PhysReg != VirtRegMap::NO_PHYS_REG && - "should have allocated a physical register"); - - MFI->allocateWWMSpill(MF, PhysReg); -} - -bool SILowerWWMCopies::runOnMachineFunction(MachineFunction &MF) { - const GCNSubtarget &ST = MF.getSubtarget(); - const SIInstrInfo *TII = ST.getInstrInfo(); - - MFI = MF.getInfo(); - LIS = getAnalysisIfAvailable(); - Indexes = getAnalysisIfAvailable(); - VRM = getAnalysisIfAvailable(); - TRI = ST.getRegisterInfo(); - MRI = &MF.getRegInfo(); - - if (!MFI->hasVRegFlags()) - return false; - - bool Changed = false; - for (MachineBasicBlock &MBB : MF) { - for (MachineInstr &MI : MBB) { - if (MI.getOpcode() != AMDGPU::WWM_COPY) - continue; - - // TODO: Club adjacent WWM ops between same exec save/restore - assert(TII->isVGPRCopy(MI)); - - // For WWM vector copies, manipulate the exec mask around the copy - // instruction. - const DebugLoc &DL = MI.getDebugLoc(); - MachineBasicBlock::iterator InsertPt = MI.getIterator(); - Register RegForExecCopy = MFI->getSGPRForEXECCopy(); - TII->insertScratchExecCopy(MF, MBB, InsertPt, DL, RegForExecCopy, - isSCCLiveAtMI(MI), Indexes); - TII->restoreExec(MF, MBB, ++InsertPt, DL, RegForExecCopy, Indexes); - addToWWMSpills(MF, MI.getOperand(0).getReg()); - LLVM_DEBUG(dbgs() << "WWM copy manipulation for " << MI); - - // Lower WWM_COPY back to COPY - MI.setDesc(TII->get(AMDGPU::COPY)); - Changed |= true; - } - } - - return Changed; -} diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 219464eac9ec..c9376d0ea653 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -314,23 +314,37 @@ bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs, return false; } -bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills( - MachineFunction &MF, int FI, unsigned LaneIndex) { +bool SIMachineFunctionInfo::allocateVGPRForSGPRSpills(MachineFunction &MF, + int FI, + unsigned LaneIndex) { + const GCNSubtarget &ST = MF.getSubtarget(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); Register LaneVGPR; if (!LaneIndex) { - LaneVGPR = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF); + if (LaneVGPR == AMDGPU::NoRegister) { + // We have no VGPRs left for spilling SGPRs. Reset because we will not + // partially spill the SGPR to VGPRs. 
+ SGPRSpillToVGPRLanes.erase(FI); + return false; + } + SpillVGPRs.push_back(LaneVGPR); + // Add this register as live-in to all blocks to avoid machine verifier + // complaining about use of an undefined physical register. + for (MachineBasicBlock &BB : MF) + BB.addLiveIn(LaneVGPR); } else { LaneVGPR = SpillVGPRs.back(); } - SGPRSpillsToVirtualVGPRLanes[FI].push_back( + SGPRSpillToVGPRLanes[FI].push_back( SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex)); return true; } -bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills( +bool SIMachineFunctionInfo::allocateVGPRForPrologEpilogSGPRSpills( MachineFunction &MF, int FI, unsigned LaneIndex) { const GCNSubtarget &ST = MF.getSubtarget(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); @@ -341,21 +355,16 @@ bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills( if (LaneVGPR == AMDGPU::NoRegister) { // We have no VGPRs left for spilling SGPRs. Reset because we will not // partially spill the SGPR to VGPRs. - SGPRSpillsToPhysicalVGPRLanes.erase(FI); + PrologEpilogSGPRSpillToVGPRLanes.erase(FI); return false; } allocateWWMSpill(MF, LaneVGPR); - reserveWWMRegister(LaneVGPR); - for (MachineBasicBlock &MBB : MF) { - MBB.addLiveIn(LaneVGPR); - MBB.sortUniqueLiveIns(); - } } else { - LaneVGPR = WWMReservedRegs.back(); + LaneVGPR = WWMSpills.back().first; } - SGPRSpillsToPhysicalVGPRLanes[FI].push_back( + PrologEpilogSGPRSpillToVGPRLanes[FI].push_back( SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex)); return true; } @@ -364,8 +373,8 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF, int FI, bool IsPrologEpilog) { std::vector &SpillLanes = - IsPrologEpilog ? SGPRSpillsToPhysicalVGPRLanes[FI] - : SGPRSpillsToVirtualVGPRLanes[FI]; + IsPrologEpilog ? PrologEpilogSGPRSpillToVGPRLanes[FI] + : SGPRSpillToVGPRLanes[FI]; // This has already been allocated. if (!SpillLanes.empty()) @@ -386,14 +395,15 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF, "not spilling SGPRs to VGPRs"); unsigned &NumSpillLanes = - IsPrologEpilog ? NumPhysicalVGPRSpillLanes : NumVirtualVGPRSpillLanes; + IsPrologEpilog ? NumVGPRPrologEpilogSpillLanes : NumVGPRSpillLanes; for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) { unsigned LaneIndex = (NumSpillLanes % WaveSize); - bool Allocated = IsPrologEpilog - ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex) - : allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex); + bool Allocated = + IsPrologEpilog + ? allocateVGPRForPrologEpilogSGPRSpills(MF, FI, LaneIndex) + : allocateVGPRForSGPRSpills(MF, FI, LaneIndex); if (!Allocated) { NumSpillLanes -= I; return false; @@ -474,25 +484,16 @@ bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF, bool SIMachineFunctionInfo::removeDeadFrameIndices( MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) { - // Remove dead frame indices from function frame, however keep FP & BP since - // spills for them haven't been inserted yet. And also make sure to remove the - // frame indices from `SGPRSpillsToVirtualVGPRLanes` data structure, - // otherwise, it could result in an unexpected side effect and bug, in case of - // any re-mapping of freed frame indices by later pass(es) like "stack slot + // Remove dead frame indices from function frame. And also make sure to remove + // the frame indices from `SGPRSpillToVGPRLanes` data structure, otherwise, it + // could result in an unexpected side effect and bug, in case of any + // re-mapping of freed frame indices by later pass(es) like "stack slot // coloring". 
- for (auto &R : make_early_inc_range(SGPRSpillsToVirtualVGPRLanes)) { + for (auto &R : make_early_inc_range(SGPRSpillToVGPRLanes)) { MFI.RemoveStackObject(R.first); - SGPRSpillsToVirtualVGPRLanes.erase(R.first); + SGPRSpillToVGPRLanes.erase(R.first); } - // Remove the dead frame indices of CSR SGPRs which are spilled to physical - // VGPR lanes during SILowerSGPRSpills pass. - if (!ResetSGPRSpillStackIDs) { - for (auto &R : make_early_inc_range(SGPRSpillsToPhysicalVGPRLanes)) { - MFI.RemoveStackObject(R.first); - SGPRSpillsToPhysicalVGPRLanes.erase(R.first); - } - } bool HaveSGPRToMemory = false; if (ResetSGPRSpillStackIDs) { diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 37572d30dff6..3b4747adf125 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -496,16 +496,15 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, }; private: - // To track virtual VGPR + lane index for each subregister of the SGPR spilled - // to frameindex key during SILowerSGPRSpills pass. + // To track VGPR + lane index for each subregister of the SGPR spilled to + // frameindex key during SILowerSGPRSpills pass. + DenseMap> SGPRSpillToVGPRLanes; + // To track VGPR + lane index for spilling special SGPRs like Frame Pointer + // identified during PrologEpilogInserter. DenseMap> - SGPRSpillsToVirtualVGPRLanes; - // To track physical VGPR + lane index for CSR SGPR spills and special SGPRs - // like Frame Pointer identified during PrologEpilogInserter. - DenseMap> - SGPRSpillsToPhysicalVGPRLanes; - unsigned NumVirtualVGPRSpillLanes = 0; - unsigned NumPhysicalVGPRSpillLanes = 0; + PrologEpilogSGPRSpillToVGPRLanes; + unsigned NumVGPRSpillLanes = 0; + unsigned NumVGPRPrologEpilogSpillLanes = 0; SmallVector SpillVGPRs; using WWMSpillsMap = MapVector; // To track the registers used in instructions that can potentially modify the @@ -549,10 +548,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, private: Register VGPRForAGPRCopy; - bool allocateVirtualVGPRForSGPRSpills(MachineFunction &MF, int FI, - unsigned LaneIndex); - bool allocatePhysicalVGPRForSGPRSpills(MachineFunction &MF, int FI, - unsigned LaneIndex); + bool allocateVGPRForSGPRSpills(MachineFunction &MF, int FI, + unsigned LaneIndex); + bool allocateVGPRForPrologEpilogSGPRSpills(MachineFunction &MF, int FI, + unsigned LaneIndex); public: Register getVGPRForAGPRCopy() const { @@ -584,9 +583,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, SIModeRegisterDefaults getMode() const { return Mode; } ArrayRef - getSGPRSpillToVirtualVGPRLanes(int FrameIndex) const { - auto I = SGPRSpillsToVirtualVGPRLanes.find(FrameIndex); - return (I == SGPRSpillsToVirtualVGPRLanes.end()) + getSGPRSpillToVGPRLanes(int FrameIndex) const { + auto I = SGPRSpillToVGPRLanes.find(FrameIndex); + return (I == SGPRSpillToVGPRLanes.end()) ? ArrayRef() : ArrayRef(I->second); } @@ -648,9 +647,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, } ArrayRef - getSGPRSpillToPhysicalVGPRLanes(int FrameIndex) const { - auto I = SGPRSpillsToPhysicalVGPRLanes.find(FrameIndex); - return (I == SGPRSpillsToPhysicalVGPRLanes.end()) + getPrologEpilogSGPRSpillToVGPRLanes(int FrameIndex) const { + auto I = PrologEpilogSGPRSpillToVGPRLanes.find(FrameIndex); + return (I == PrologEpilogSGPRSpillToVGPRLanes.end()) ? 
ArrayRef() : ArrayRef(I->second); } @@ -668,8 +667,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, return VRegFlags.inBounds(Reg) && VRegFlags[Reg] & Flag; } - bool hasVRegFlags() { return VRegFlags.size(); } - void allocateWWMSpill(MachineFunction &MF, Register VGPR, uint64_t Size = 4, Align Alignment = Align(4)); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 1d50dff4a7d9..c2a272166241 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -712,6 +712,9 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs()) reserveRegisterTuples(Reserved, Reg); + for (auto Reg : MFI->getSGPRSpillVGPRs()) + reserveRegisterTuples(Reserved, Reg); + return Reserved; } @@ -1733,13 +1736,10 @@ void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, RegScavenger *RS, SlotIndexes *Indexes, - LiveIntervals *LIS, bool OnlyToVGPR, - bool SpillToPhysVGPRLane) const { + LiveIntervals *LIS, bool OnlyToVGPR) const { SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); - ArrayRef VGPRSpills = - SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index) - : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index); + ArrayRef VGPRSpills = SB.MFI.getSGPRSpillToVGPRLanes(Index); bool SpillToVGPR = !VGPRSpills.empty(); if (OnlyToVGPR && !SpillToVGPR) return false; @@ -1856,13 +1856,10 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index, RegScavenger *RS, SlotIndexes *Indexes, - LiveIntervals *LIS, bool OnlyToVGPR, - bool SpillToPhysVGPRLane) const { + LiveIntervals *LIS, bool OnlyToVGPR) const { SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); - ArrayRef VGPRSpills = - SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index) - : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index); + ArrayRef VGPRSpills = SB.MFI.getSGPRSpillToVGPRLanes(Index); bool SpillToVGPR = !VGPRSpills.empty(); if (OnlyToVGPR && !SpillToVGPR) return false; @@ -2008,7 +2005,7 @@ bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI, /// handled. 
bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, - SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const { + SlotIndexes *Indexes, LiveIntervals *LIS) const { switch (MI->getOpcode()) { case AMDGPU::SI_SPILL_S1024_SAVE: case AMDGPU::SI_SPILL_S512_SAVE: @@ -2024,7 +2021,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( case AMDGPU::SI_SPILL_S96_SAVE: case AMDGPU::SI_SPILL_S64_SAVE: case AMDGPU::SI_SPILL_S32_SAVE: - return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane); + return spillSGPR(MI, FI, RS, Indexes, LIS, true); case AMDGPU::SI_SPILL_S1024_RESTORE: case AMDGPU::SI_SPILL_S512_RESTORE: case AMDGPU::SI_SPILL_S384_RESTORE: @@ -2039,7 +2036,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( case AMDGPU::SI_SPILL_S96_RESTORE: case AMDGPU::SI_SPILL_S64_RESTORE: case AMDGPU::SI_SPILL_S32_RESTORE: - return restoreSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane); + return restoreSGPR(MI, FI, RS, Indexes, LIS, true); default: llvm_unreachable("not an SGPR spill instruction"); } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 17fce43891c5..2120b47c581e 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -142,17 +142,14 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo { void buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, int Offset, bool IsLoad, bool IsKill = true) const; - /// If \p OnlyToVGPR is true, this will only succeed if this manages to find a - /// free VGPR lane to spill. + /// If \p OnlyToVGPR is true, this will only succeed if this bool spillSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, SlotIndexes *Indexes = nullptr, LiveIntervals *LIS = nullptr, - bool OnlyToVGPR = false, - bool SpillToPhysVGPRLane = false) const; + bool OnlyToVGPR = false) const; bool restoreSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, SlotIndexes *Indexes = nullptr, LiveIntervals *LIS = nullptr, - bool OnlyToVGPR = false, - bool SpillToPhysVGPRLane = false) const; + bool OnlyToVGPR = false) const; bool spillEmergencySGPR(MachineBasicBlock::iterator MI, MachineBasicBlock &RestoreMBB, Register SGPR, @@ -166,10 +163,10 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo { unsigned FIOperandNum, RegScavenger *RS) const override; - bool eliminateSGPRToVGPRSpillFrameIndex( - MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, - SlotIndexes *Indexes = nullptr, LiveIntervals *LIS = nullptr, - bool SpillToPhysVGPRLane = false) const; + bool eliminateSGPRToVGPRSpillFrameIndex(MachineBasicBlock::iterator MI, + int FI, RegScavenger *RS, + SlotIndexes *Indexes = nullptr, + LiveIntervals *LIS = nullptr) const; StringRef getRegAsmName(MCRegister Reg) const override; diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index c2d7d605fbc2..444ee2efb6d2 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -205,6 +205,8 @@ ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)), ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0x10000))}; II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M)); + II.setMetadata(LLVMContext::MD_noundef, + MDNode::get(II.getContext(), std::nullopt)); return &II; } break; diff --git 
a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp index dae323ec24fb..8c642f61019a 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp @@ -1672,9 +1672,9 @@ getT2AddrModeImm8OffsetOpValue(const MCInst &MI, unsigned OpNum, // FIXME: Needs fixup support. unsigned Value = 0; - int32_t tmp = (int32_t)MO1.getImm(); - if (tmp < 0) - tmp = abs(tmp); + auto tmp = static_cast(MO1.getImm()); + if (static_cast(tmp) < 0) + tmp = -tmp; else Value |= 256; // Set the ADD bit Value |= tmp & 255; diff --git a/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp b/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp index 088195994edd..67574403ca83 100644 --- a/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp +++ b/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp @@ -207,8 +207,32 @@ void BPFMISimplifyPatchable::processDstReg(MachineRegisterInfo *MRI, decltype(End) NextI; for (auto I = Begin; I != End; I = NextI) { NextI = std::next(I); - if (doSrcRegProp) + if (doSrcRegProp) { + // In situations like below it is not known if usage is a kill + // after setReg(): + // + // .-> %2:gpr = LD_imm64 @"llvm.t:0:0$0:0" + // | + // |`----------------. + // | %3:gpr = LDD %2:gpr, 0 + // | %4:gpr = ADD_rr %0:gpr(tied-def 0), killed %3:gpr <--- (1) + // | %5:gpr = LDD killed %4:gpr, 0 ^^^^^^^^^^^^^ + // | STD killed %5:gpr, %1:gpr, 0 this is I + // `----------------. + // %6:gpr = LDD %2:gpr, 0 + // %7:gpr = ADD_rr %0:gpr(tied-def 0), killed %6:gpr <--- (2) + // %8:gpr = LDD killed %7:gpr, 0 ^^^^^^^^^^^^^ + // STD killed %8:gpr, %1:gpr, 0 this is I + // + // Instructions (1) and (2) would be updated by setReg() to: + // + // ADD_rr %0:gpr(tied-def 0), %2:gpr + // + // %2:gpr is not killed at (1), so it is necessary to remove kill flag + // from I. I->setReg(SrcReg); + I->setIsKill(false); + } // The candidate needs to have a unique definition. if (IsAma && MRI->getUniqueVRegDef(I->getReg())) diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp index 485ba88a4654..3c1422b0e1a2 100644 --- a/llvm/lib/Target/BPF/BTFDebug.cpp +++ b/llvm/lib/Target/BPF/BTFDebug.cpp @@ -1368,6 +1368,8 @@ void BTFDebug::beginInstruction(const MachineInstr *MI) { // been generated, construct one based on function signature. 
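The ARMMCCodeEmitter hunk above replaces abs() on a signed immediate with negation through an unsigned temporary (the template argument of the static_cast is elided in this listing; uint32_t is assumed). A minimal standalone sketch of why that matters, assuming a 32-bit immediate:

// Why the emitter avoids abs() on a signed value: abs(INT32_MIN) overflows
// int, which is undefined behavior, while negating the same bits as uint32_t
// is well defined (modular) and yields the magnitude the encoder needs.
#include <cstdint>
#include <cstdio>

uint32_t magnitudeBits(int64_t Imm) {
  auto Tmp = static_cast<uint32_t>(Imm);
  if (static_cast<int32_t>(Tmp) < 0)
    Tmp = -Tmp; // 0x80000000 negates to itself, no UB
  return Tmp;
}

int main() {
  std::printf("%u\n", static_cast<unsigned>(magnitudeBits(-255)));       // 255
  std::printf("%#x\n", static_cast<unsigned>(magnitudeBits(INT32_MIN))); // 0x80000000
}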
if (LineInfoGenerated == false) { auto *S = MI->getMF()->getFunction().getSubprogram(); + if (!S) + return; MCSymbol *FuncLabel = Asm->getFunctionBegin(); constructLineInfo(S, FuncLabel, S->getLine(), 0); LineInfoGenerated = true; diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index 56fdf19a0720..4f93cdaaa137 100644 --- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -1544,24 +1544,58 @@ bool PPCAsmParser::ParseOperand(OperandVector &Operands) { Operands.push_back(PPCOperand::CreateFromMCExpr(EVal, S, E, isPPC64())); // Check whether this is a TLS call expression - bool TLSCall = false; - if (const MCSymbolRefExpr *Ref = dyn_cast(EVal)) - TLSCall = Ref->getSymbol().getName() == "__tls_get_addr"; + const char TlsGetAddr[] = "__tls_get_addr"; + bool TlsCall = false; + const MCExpr *TlsCallAddend = nullptr; + if (auto *Ref = dyn_cast(EVal)) { + TlsCall = Ref->getSymbol().getName() == TlsGetAddr; + } else if (auto *Bin = dyn_cast(EVal); + Bin && Bin->getOpcode() == MCBinaryExpr::Add) { + if (auto *Ref = dyn_cast(Bin->getLHS())) { + TlsCall = Ref->getSymbol().getName() == TlsGetAddr; + TlsCallAddend = Bin->getRHS(); + } + } - if (TLSCall && parseOptionalToken(AsmToken::LParen)) { + if (TlsCall && parseOptionalToken(AsmToken::LParen)) { const MCExpr *TLSSym; - S = Parser.getTok().getLoc(); + const SMLoc S2 = Parser.getTok().getLoc(); if (ParseExpression(TLSSym)) - return Error(S, "invalid TLS call expression"); + return Error(S2, "invalid TLS call expression"); + E = Parser.getTok().getLoc(); if (parseToken(AsmToken::RParen, "expected ')'")) return true; - E = Parser.getTok().getLoc(); + // PPC32 allows bl __tls_get_addr[+a](x@tlsgd)@plt+b. Parse "@plt[+b]". + if (!isPPC64() && parseOptionalToken(AsmToken::At)) { + AsmToken Tok = getTok(); + if (!(parseOptionalToken(AsmToken::Identifier) && + Tok.getString().compare_insensitive("plt") == 0)) + return Error(Tok.getLoc(), "expected 'plt'"); + EVal = MCSymbolRefExpr::create(TlsGetAddr, MCSymbolRefExpr::VK_PLT, + getContext()); + if (parseOptionalToken(AsmToken::Plus)) { + const MCExpr *Addend = nullptr; + SMLoc EndLoc; + if (parsePrimaryExpr(Addend, EndLoc)) + return true; + if (TlsCallAddend) // __tls_get_addr+a(x@tlsgd)@plt+b + TlsCallAddend = + MCBinaryExpr::createAdd(TlsCallAddend, Addend, getContext()); + else // __tls_get_addr(x@tlsgd)@plt+b + TlsCallAddend = Addend; + } + if (TlsCallAddend) + EVal = MCBinaryExpr::createAdd(EVal, TlsCallAddend, getContext()); + // Add a __tls_get_addr operand with addend a, b, or a+b. + Operands.back() = PPCOperand::CreateFromMCExpr( + EVal, S, Parser.getTok().getLoc(), false); + } Operands.push_back(PPCOperand::CreateFromMCExpr(TLSSym, S, E, isPPC64())); } // Otherwise, check for D-form memory operands - if (!TLSCall && parseOptionalToken(AsmToken::LParen)) { + if (!TlsCall && parseOptionalToken(AsmToken::LParen)) { S = Parser.getTok().getLoc(); int64_t IntVal; diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp index dbdfb6e906bb..13480da4e731 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp @@ -564,10 +564,10 @@ void PPCInstPrinter::printTLSCall(const MCInst *MI, unsigned OpNo, // come at the _end_ of the expression. 
const MCOperand &Op = MI->getOperand(OpNo); const MCSymbolRefExpr *RefExp = nullptr; - const MCConstantExpr *ConstExp = nullptr; + const MCExpr *Rhs = nullptr; if (const MCBinaryExpr *BinExpr = dyn_cast(Op.getExpr())) { RefExp = cast(BinExpr->getLHS()); - ConstExp = cast(BinExpr->getRHS()); + Rhs = BinExpr->getRHS(); } else RefExp = cast(Op.getExpr()); @@ -584,8 +584,14 @@ void PPCInstPrinter::printTLSCall(const MCInst *MI, unsigned OpNo, if (RefExp->getKind() != MCSymbolRefExpr::VK_None && RefExp->getKind() != MCSymbolRefExpr::VK_PPC_NOTOC) O << '@' << MCSymbolRefExpr::getVariantKindName(RefExp->getKind()); - if (ConstExp != nullptr) - O << '+' << ConstExp->getValue(); + if (Rhs) { + SmallString<0> Buf; + raw_svector_ostream Tmp(Buf); + Rhs->print(Tmp, &MAI); + if (isdigit(Buf[0])) + O << '+'; + O << Buf; + } } /// showRegistersWithPercentPrefix - Check if this register name should be diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 0ebfc007b3d7..96fd83ab6a7b 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -7633,20 +7633,6 @@ void PPCDAGToDAGISel::PeepholePPC64() { case PPC::ADDItocL: Flags = PPCII::MO_TOC_LO; break; - case PPC::ADDItoc: - case PPC::ADDItoc8: - if (RequiresMod4Offset) { - if (GlobalAddressSDNode *GA = - dyn_cast(Base.getOperand(0))) { - const GlobalValue *GV = GA->getGlobal(); - Align Alignment = GV->getPointerAlignment(CurDAG->getDataLayout()); - // XMC_TD global that is underaligned being accessed with a DS form - // instruction. - if (Alignment < 4) - continue; - } - } - break; } SDValue ImmOpnd = Base.getOperand(1); @@ -7741,27 +7727,12 @@ void PPCDAGToDAGISel::PeepholePPC64() { } } - const unsigned BaseOpcode = Base.getMachineOpcode(); - // ADDItoc and ADDItoc8 are pseudos used exclusively by AIX small code - // model when a global is defined in the TOC. 
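The PPCInstPrinter hunk above inserts '+' before the addend only when its printed form starts with a digit, so a negative addend keeps its own '-' sign. A tiny sketch of that formatting rule, using a hypothetical helper rather than the printer API:

// "sym" plus addend text: prepend '+' only for a leading digit, so "-8"
// prints as "sym-8" rather than "sym+-8".
#include <cctype>
#include <iostream>
#include <string>

std::string printWithAddend(const std::string &Sym, const std::string &Rhs) {
  std::string Out = Sym;
  if (!Rhs.empty() && std::isdigit(static_cast<unsigned char>(Rhs[0])))
    Out += '+';
  return Out + Rhs;
}

int main() {
  std::cout << printWithAddend("__tls_get_addr", "32") << "\n"; // __tls_get_addr+32
  std::cout << printWithAddend("__tls_get_addr", "-8") << "\n"; // __tls_get_addr-8
}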
- const bool OpcodeIsAIXTocData = - BaseOpcode == PPC::ADDItoc || BaseOpcode == PPC::ADDItoc8; - if (FirstOp == 1) // Store - if (OpcodeIsAIXTocData) - (void)CurDAG->UpdateNodeOperands(N, N->getOperand(0), - Base.getOperand(0), Base.getOperand(1), - N->getOperand(3)); - else - (void)CurDAG->UpdateNodeOperands(N, N->getOperand(0), ImmOpnd, - Base.getOperand(0), N->getOperand(3)); + (void)CurDAG->UpdateNodeOperands(N, N->getOperand(0), ImmOpnd, + Base.getOperand(0), N->getOperand(3)); else // Load - if (OpcodeIsAIXTocData) - (void)CurDAG->UpdateNodeOperands(N, Base.getOperand(0), - Base.getOperand(1), N->getOperand(2)); - else - (void)CurDAG->UpdateNodeOperands(N, ImmOpnd, Base.getOperand(0), - N->getOperand(2)); + (void)CurDAG->UpdateNodeOperands(N, ImmOpnd, Base.getOperand(0), + N->getOperand(2)); if (UpdateHBase) (void)CurDAG->UpdateNodeOperands(HBase.getNode(), HBase.getOperand(0), diff --git a/llvm/lib/Target/PowerPC/PPCInstrFormats.td b/llvm/lib/Target/PowerPC/PPCInstrFormats.td index 0081c0f5295a..224c7b281ac4 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFormats.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFormats.td @@ -461,6 +461,12 @@ class XForm_tlb xo, dag OOL, dag IOL, string asmstr, let RST = 0; } +class XForm_tlbilx xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> : XForm_base_r3xo<31, xo, OOL, IOL, asmstr, itin, []> { + bits<5> T; + let RST = T; +} + class XForm_attn opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, InstrItinClass itin> : I { diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index 616f4e48cfb8..2992f78aa38a 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -4317,6 +4317,9 @@ def TLBSX : XForm_tlb<914, (outs), (ins gprc:$RA, gprc:$RB), "tlbsx $RA, $RB", def TLBIVAX : XForm_tlb<786, (outs), (ins gprc:$RA, gprc:$RB), "tlbivax $RA, $RB", IIC_LdStLoad>, Requires<[IsBookE]>; +def TLBILX : XForm_tlbilx<18, (outs), (ins u2imm:$T, gprc:$RA, gprc:$RB), + "tlbilx $T, $RA, $RB", IIC_LdStLoad>, Requires<[IsBookE]>; + def TLBRE : XForm_24_eieio<31, 946, (outs), (ins), "tlbre", IIC_LdStLoad, []>, Requires<[IsBookE]>; @@ -4680,6 +4683,12 @@ def : InstAlias<"tlbwehi $RS, $A", (TLBWE2 gprc:$RS, gprc:$A, 0)>, def : InstAlias<"tlbwelo $RS, $A", (TLBWE2 gprc:$RS, gprc:$A, 1)>, Requires<[IsPPC4xx]>; +def : InstAlias<"tlbilxlpid", (TLBILX 0, R0, R0)>, Requires<[IsBookE]>; +def : InstAlias<"tlbilxpid", (TLBILX 1, R0, R0)>, Requires<[IsBookE]>; +def : InstAlias<"tlbilxva $RA, $RB", (TLBILX 3, gprc:$RA, gprc:$RB)>, + Requires<[IsBookE]>; +def : InstAlias<"tlbilxva $RB", (TLBILX 3, R0, gprc:$RB)>, Requires<[IsBookE]>; + def LAx : PPCAsmPseudo<"la $rA, $addr", (ins gprc:$rA, memri:$addr)>; def SUBI : PPCAsmPseudo<"subi $rA, $rB, $imm", diff --git a/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp b/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp index 1f7dba66db35..976effb96adc 100644 --- a/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp +++ b/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp @@ -42,10 +42,6 @@ static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, Mangler::getNameWithPrefix(Name, MO.getSymbolName(), DL); } else { const GlobalValue *GV = MO.getGlobal(); - if (const GlobalVariable *GVar = dyn_cast(GV)) - if (GVar->hasAttribute("toc-data")) - return TM.getSymbol(GV); - TM.getNameWithPrefix(Name, GV, Mang); } diff --git a/llvm/lib/Target/PowerPC/PPCScheduleP9.td b/llvm/lib/Target/PowerPC/PPCScheduleP9.td index e9f4daa62de3..b763191d980e 100644 --- 
a/llvm/lib/Target/PowerPC/PPCScheduleP9.td +++ b/llvm/lib/Target/PowerPC/PPCScheduleP9.td @@ -42,7 +42,7 @@ def P9Model : SchedMachineModel { // Power 9, or MMA, or paired vector mem ops, or PC relative mem ops, or // instructions introduced after ISA 3.0. let UnsupportedFeatures = [HasSPE, PrefixInstrs, MMA, - PairedVectorMemops, + PairedVectorMemops, IsBookE, PCRelativeMemops, IsISA3_1, IsISAFuture]; } diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp index f7d11e921c7d..d2520d932ddf 100644 --- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp +++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp @@ -225,16 +225,23 @@ bool RISCVAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, const MachineOperand &AddrReg = MI->getOperand(OpNo); assert(MI->getNumOperands() > OpNo + 1 && "Expected additional operand"); - const MachineOperand &DispImm = MI->getOperand(OpNo + 1); + const MachineOperand &Offset = MI->getOperand(OpNo + 1); // All memory operands should have a register and an immediate operand (see // RISCVDAGToDAGISel::SelectInlineAsmMemoryOperand). if (!AddrReg.isReg()) return true; - if (!DispImm.isImm()) + if (!Offset.isImm() && !Offset.isGlobal()) return true; - OS << DispImm.getImm() << "(" - << RISCVInstPrinter::getRegisterName(AddrReg.getReg()) << ")"; + MCOperand MCO; + if (!lowerOperand(Offset, MCO)) + return true; + + if (Offset.isImm()) + OS << MCO.getImm(); + else if (Offset.isGlobal()) + OS << *MCO.getExpr(); + OS << "(" << RISCVInstPrinter::getRegisterName(AddrReg.getReg()) << ")"; return false; } diff --git a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp index 59f1e8319ae7..d10bba26023f 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp @@ -572,6 +572,15 @@ bool tryToFoldBNEOnCmpXchgResult(MachineBasicBlock &MBB, if (!(BNEOp0 == DestReg && BNEOp1 == CmpValReg) && !(BNEOp0 == CmpValReg && BNEOp1 == DestReg)) return false; + + // Make sure the branch is the only user of the AND. + if (MaskReg.isValid()) { + if (BNEOp0 == DestReg && !MBBI->getOperand(0).isKill()) + return false; + if (BNEOp1 == DestReg && !MBBI->getOperand(1).isKill()) + return false; + } + ToErase.push_back(&*MBBI); LoopHeadBNETarget = MBBI->getOperand(2).getMBB(); MBBI = skipDebugInstructionsForward(std::next(MBBI), E); diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index ca2d9474d1ed..f312cc8129dd 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -262,22 +262,16 @@ static unsigned getPushPopEncoding(const Register MaxReg) { // Get the max reg of Push/Pop for restoring callee saved registers. 
static Register getMaxPushPopReg(const MachineFunction &MF, - const std::vector &CSI, - unsigned &PushPopRegs) { + const std::vector &CSI) { Register MaxPushPopReg = RISCV::NoRegister; - PushPopRegs = 0; for (auto &CS : CSI) { Register Reg = CS.getReg(); - if (RISCV::PGPRRegClass.contains(Reg)) { + if (RISCV::PGPRRegClass.contains(Reg)) MaxPushPopReg = std::max(MaxPushPopReg.id(), Reg.id()); - PushPopRegs += 1; - } } // if rlist is {rs, s0-s10}, then s11 will also be included - if (MaxPushPopReg == RISCV::X26) { + if (MaxPushPopReg == RISCV::X26) MaxPushPopReg = RISCV::X27; - PushPopRegs = 13; - } return MaxPushPopReg; } @@ -581,11 +575,18 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, int64_t Offset; // Offsets for objects with fixed locations (IE: those saved by libcall) are // simply calculated from the frame index. - if (FrameIdx < 0) - Offset = FrameIdx * (int64_t) STI.getXLen() / 8; - else + if (FrameIdx < 0) { + if (RVFI->isPushable(MF)) { + // Callee-saved register stored by Zcmp push is in reverse order. + Offset = -(FrameIdx + RVFI->getRVPushRegs() + 1) * + (int64_t)STI.getXLen() / 8; + } else { + Offset = FrameIdx * (int64_t)STI.getXLen() / 8; + } + } else { Offset = MFI.getObjectOffset(Entry.getFrameIdx()) - RVFI->getLibCallStackSize(); + } Register Reg = Entry.getReg(); unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( nullptr, RI->getDwarfRegNum(Reg, true), Offset)); @@ -771,7 +772,8 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, if (FirstSPAdjustAmount) StackSize = FirstSPAdjustAmount; - if (RVFI->isPushable(MF) && MBBI->getOpcode() == RISCV::CM_POP) { + if (RVFI->isPushable(MF) && MBBI != MBB.end() && + MBBI->getOpcode() == RISCV::CM_POP) { // Use available stack adjustment in pop instruction to deallocate stack // space. unsigned PushStack = RVFI->getRVPushRegs() * (STI.getXLen() / 8); @@ -1325,10 +1327,11 @@ bool RISCVFrameLowering::spillCalleeSavedRegisters( // Emit CM.PUSH with base SPimm & evaluate Push stack RISCVMachineFunctionInfo *RVFI = MF->getInfo(); if (RVFI->isPushable(*MF)) { - unsigned PushPopRegs = 0; - Register MaxReg = getMaxPushPopReg(*MF, CSI, PushPopRegs); - RVFI->setRVPushRegs(PushPopRegs); - RVFI->setRVPushStackSize(alignTo((STI.getXLen() / 8) * PushPopRegs, 16)); + Register MaxReg = getMaxPushPopReg(*MF, CSI); + unsigned PushedRegNum = + getPushPopEncoding(MaxReg) - llvm::RISCVZC::RLISTENCODE::RA + 1; + RVFI->setRVPushRegs(PushedRegNum); + RVFI->setRVPushStackSize(alignTo((STI.getXLen() / 8) * PushedRegNum, 16)); if (MaxReg != RISCV::NoRegister) { // Use encoded number to represent registers to spill. @@ -1340,7 +1343,7 @@ bool RISCVFrameLowering::spillCalleeSavedRegisters( PushBuilder.addImm((int64_t)RegEnc); PushBuilder.addImm(0); - for (unsigned i = 0; i < PushPopRegs; i++) + for (unsigned i = 0; i < PushedRegNum; i++) PushBuilder.addUse(AllPopRegs[i], RegState::Implicit); } } else if (const char *SpillLibCall = getSpillLibCallName(*MF, CSI)) { diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index cafce628cf6a..aa20409da4e2 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -3414,6 +3414,7 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) { // Because N and True must have the same merge operand (or True's operand is // implicit_def), the "effective" body is the minimum of their VLs. 
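For the Zcmp path in RISCVFrameLowering, the new branch computes CFI offsets for registers saved by cm.push in reverse slot order. Plugging sample numbers into the formula from the hunk (the FrameIdx/register pairing here is assumed purely for illustration):

// cm.push saving three registers on RV32: XLen/8 == 4, RVPushRegs == 3.
// Offset = -(FrameIdx + RVPushRegs + 1) * XLen/8 reverses the slot order
// relative to the plain FrameIdx * XLen/8 rule used for libcall saves.
#include <cstdint>
#include <cstdio>

int64_t pushOffset(int FrameIdx, unsigned PushRegs, unsigned XLenBytes) {
  return -(FrameIdx + static_cast<int>(PushRegs) + 1) *
         static_cast<int64_t>(XLenBytes);
}

int main() {
  for (int FrameIdx = -1; FrameIdx >= -3; --FrameIdx)
    std::printf("FrameIdx %d -> offset %lld\n", FrameIdx,
                static_cast<long long>(pushOffset(FrameIdx, 3, 4)));
  // FrameIdx -1 -> -12, FrameIdx -2 -> -8, FrameIdx -3 -> -4
}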
+ SDValue OrigVL = VL; VL = GetMinVL(TrueVL, VL); if (!VL) return false; @@ -3461,7 +3462,17 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) { "Expected instructions with mask have a tied dest."); #endif - uint64_t Policy = isImplicitDef(Merge) ? RISCVII::TAIL_AGNOSTIC : /*TUMU*/ 0; + // Use a tumu policy, relaxing it to tail agnostic provided that the merge + // operand is undefined. + // + // However, if the VL became smaller than what the vmerge had originally, then + // elements past VL that were previously in the vmerge's body will have moved + // to the tail. In that case we always need to use tail undisturbed to + // preserve them. + bool MergeVLShrunk = VL != OrigVL; + uint64_t Policy = (isImplicitDef(Merge) && !MergeVLShrunk) + ? RISCVII::TAIL_AGNOSTIC + : /*TUMU*/ 0; SDValue PolicyOp = CurDAG->getTargetConstant(Policy, DL, Subtarget->getXLenVT()); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index f49c5011607f..f030982cb815 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -3113,12 +3113,13 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, bool Negate = false; int64_t SplatStepVal = StepNumerator; unsigned StepOpcode = ISD::MUL; - if (StepNumerator != 1) { - if (isPowerOf2_64(std::abs(StepNumerator))) { - Negate = StepNumerator < 0; - StepOpcode = ISD::SHL; - SplatStepVal = Log2_64(std::abs(StepNumerator)); - } + // Exclude INT64_MIN to avoid passing it to std::abs. We won't optimize it + // anyway as the shift of 63 won't fit in uimm5. + if (StepNumerator != 1 && StepNumerator != INT64_MIN && + isPowerOf2_64(std::abs(StepNumerator))) { + Negate = StepNumerator < 0; + StepOpcode = ISD::SHL; + SplatStepVal = Log2_64(std::abs(StepNumerator)); } // Only emit VIDs with suitably-small steps/addends. We use imm5 is a @@ -5368,9 +5369,10 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, if (isa(RHS)) { int64_t Imm = cast(RHS)->getSExtValue(); if (Imm != 0 && isInt<12>((uint64_t)Imm + 1)) { - // X > -1 should have been replaced with false. - assert((CCVal != ISD::SETUGT || Imm != -1) && - "Missing canonicalization"); + // If this is an unsigned compare and the constant is -1, incrementing + // the constant would change behavior. The result should be false. + if (CCVal == ISD::SETUGT && Imm == -1) + return DAG.getConstant(0, DL, VT); // Using getSetCCSwappedOperands will convert SET(U)GT->SET(U)LT. CCVal = ISD::getSetCCSwappedOperands(CCVal); SDValue SetCC = DAG.getSetCC( @@ -11710,7 +11712,11 @@ static SDValue performFP_TO_INTCombine(SDNode *N, return SDValue(); RISCVFPRndMode::RoundingMode FRM = matchRoundingOp(Src.getOpcode()); - if (FRM == RISCVFPRndMode::Invalid) + // If the result is invalid, we didn't find a foldable instruction. + // If the result is dynamic, then we found an frint which we don't yet + // support. It will cause 7 to be written to the FRM CSR for vector. + // FIXME: We could support this by using VFCVT_X_F_VL/VFCVT_XU_F_VL below. 
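The RISCVISelLowering SETCC hunk above folds an unsigned '>' against -1 to false instead of asserting it was already canonicalized away. A minimal sketch of the wraparound it guards against, assuming 64-bit operands:

// x >u C is always false when C is the all-ones value, but the generic
// rewrite to x >=u (C + 1) wraps C + 1 to 0 and becomes always true.
#include <cstdint>
#include <cstdio>

bool originalCompare(uint64_t X, uint64_t C) { return X > C; }
bool naiveRewrite(uint64_t X, uint64_t C) { return X >= C + 1; }

int main() {
  const uint64_t C = UINT64_MAX; // the immediate -1, viewed unsigned
  for (uint64_t X : {UINT64_C(0), UINT64_C(42), UINT64_MAX})
    std::printf("X=%llu  original=%d  rewritten=%d\n",
                static_cast<unsigned long long>(X),
                originalCompare(X, C), naiveRewrite(X, C));
  // original is 0 for every X; the naive rewrite is 1 for every X.
}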
+ if (FRM == RISCVFPRndMode::Invalid || FRM == RISCVFPRndMode::DYN) return SDValue(); SDLoc DL(N); diff --git a/llvm/lib/Target/RISCV/RISCVPushPopOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVPushPopOptimizer.cpp index a93e750eadc6..f885adca669f 100644 --- a/llvm/lib/Target/RISCV/RISCVPushPopOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVPushPopOptimizer.cpp @@ -132,7 +132,8 @@ bool RISCVPushPopOpt::runOnMachineFunction(MachineFunction &Fn) { for (auto &MBB : Fn) { MachineBasicBlock::iterator MBBI = containsPop(MBB); MachineBasicBlock::iterator NextI = next_nodbg(MBBI, MBB.end()); - if (MBBI != MBB.end() && NextI->getOpcode() == RISCV::PseudoRET) + if (MBBI != MBB.end() && NextI != MBB.end() && + NextI->getOpcode() == RISCV::PseudoRET) Modified |= usePopRet(MBBI, NextI, adjustRetVal(MBBI)); } return Modified; diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.td b/llvm/lib/Target/Sparc/SparcInstrInfo.td index 3d602e7e4376..9af8b17edcc5 100644 --- a/llvm/lib/Target/Sparc/SparcInstrInfo.td +++ b/llvm/lib/Target/Sparc/SparcInstrInfo.td @@ -1743,6 +1743,22 @@ let hasSideEffects = 1 in { } } +// Section A.11 - DONE and RETRY +// Section A.47 - SAVED and RESTORED +let Predicates = [HasV9], rs1 = 0, rs2 = 0 in { + let rd = 0 in + def DONE : F3_1<2, 0b111110, (outs), (ins), "done", []>; + + let rd = 1 in + def RETRY : F3_1<2, 0b111110, (outs), (ins), "retry", []>; + + let rd = 0 in + def SAVED : F3_1<2, 0b110001, (outs), (ins), "saved", []>; + + let rd = 1 in + def RESTORED : F3_1<2, 0b110001, (outs), (ins), "restored", []>; +} + // Section A.42 - Prefetch Data let Predicates = [HasV9] in { def PREFETCHr : F3_1<3, 0b101101, diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index 821efc1b758b..abac7a9bfe0a 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -1152,6 +1152,11 @@ InstructionCost SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, } } + // Type legalization (via getNumberOfParts) can't handle structs + if (TLI->getValueType(DL, Src, true) == MVT::Other) + return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, + CostKind); + unsigned NumOps = (Src->isVectorTy() ? 
getNumVectorRegs(Src) : getNumberOfParts(Src)); diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 0f677b8a4afc..05cc50712c52 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -659,6 +659,13 @@ def TuningFastGather : SubtargetFeature<"fast-gather", "HasFastGather", "true", "Indicates if gather is reasonably fast (this is true for Skylake client and all AVX-512 CPUs)">; +def TuningPreferNoGather + : SubtargetFeature<"prefer-no-gather", "PreferGather", "false", + "Prefer no gather instructions">; +def TuningPreferNoScatter + : SubtargetFeature<"prefer-no-scatter", "PreferScatter", "false", + "Prefer no scatter instructions">; + def TuningPrefer128Bit : SubtargetFeature<"prefer-128-bit", "Prefer128Bit", "true", "Prefer 128-bit AVX instructions">; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index c14d51bb4fa5..d9750ea22e2b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1648,7 +1648,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_ROUND, VT, Custom); setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom); } - for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32 }) { + for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) { setOperationAction(ISD::FP_EXTEND, VT, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom); } @@ -1656,9 +1656,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32); setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32); } - - setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal); } // This block controls legalization of the mask vector sizes that are @@ -1975,8 +1972,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setF16Action(MVT::v32f16, Expand); setOperationAction(ISD::FP_ROUND, MVT::v16f16, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Custom); - setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal); + setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Custom); for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) { setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32); setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32); @@ -2197,9 +2194,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal); setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal); - setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal); + setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal); - setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Legal); + setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom); @@ -2249,9 +2246,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom); setOperationAction(ISD::FP_ROUND, MVT::v8f16, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal); - setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); + 
setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal); - setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal); + setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal); // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE @@ -2275,8 +2272,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (!Subtarget.useSoftFloat() && (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) { - addRegisterClass(MVT::v8bf16, &X86::VR128XRegClass); - addRegisterClass(MVT::v16bf16, &X86::VR256XRegClass); + addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass + : &X86::VR128RegClass); + addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass + : &X86::VR256RegClass); // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't // provide the method to promote BUILD_VECTOR and INSERT_VECTOR_ELT. // Set the operation action Custom to do the customization later. @@ -2291,6 +2290,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); } + setOperationAction(ISD::FP_ROUND, MVT::v8bf16, Custom); addLegalFPImmediate(APFloat::getZero(APFloat::BFloat())); } @@ -2302,6 +2302,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FMUL, MVT::v32bf16, Expand); setOperationAction(ISD::FDIV, MVT::v32bf16, Expand); setOperationAction(ISD::BUILD_VECTOR, MVT::v32bf16, Custom); + setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32bf16, Custom); } @@ -11363,7 +11364,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget); - if (VT.getVectorElementType() == MVT::bf16 && Subtarget.hasBF16()) + if (VT.getVectorElementType() == MVT::bf16 && + (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget); if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget)) @@ -14795,13 +14797,9 @@ static bool isShuffleFoldableLoad(SDValue V) { } template -static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) { - return VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16(); -} - -template -bool X86TargetLowering::isSoftFP16(T VT) const { - return ::isSoftFP16(VT, Subtarget); +static bool isSoftF16(T VT, const X86Subtarget &Subtarget) { + T EltVT = VT.getScalarType(); + return EltVT == MVT::bf16 || (EltVT == MVT::f16 && !Subtarget.hasFP16()); } /// Try to lower insertion of a single element into a zero vector. 
@@ -14817,7 +14815,7 @@ static SDValue lowerShuffleAsElementInsertion( unsigned NumElts = VT.getVectorNumElements(); unsigned EltBits = VT.getScalarSizeInBits(); - if (isSoftFP16(EltVT, Subtarget)) + if (isSoftF16(EltVT, Subtarget)) return SDValue(); int V2Index = @@ -20374,7 +20372,7 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); - if (isSoftFP16(VT)) { + if (isSoftF16(VT, Subtarget)) { MVT NVT = VT.changeVectorElementTypeToInteger(); return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond, DAG.getBitcast(NVT, LHS), @@ -21852,7 +21850,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); - if (isSoftFP16(VT)) + if (isSoftF16(VT, Subtarget)) return promoteXINT_TO_FP(Op, DAG); else if (isLegalConversion(SrcVT, true, Subtarget)) return Op; @@ -22357,7 +22355,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, if (DstVT == MVT::f128) return SDValue(); - if (isSoftFP16(DstVT)) + if (isSoftF16(DstVT, Subtarget)) return promoteXINT_TO_FP(Op, DAG); else if (isLegalConversion(SrcVT, false, Subtarget)) return Op; @@ -23314,7 +23312,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); SDValue Res; - if (isSoftFP16(SrcVT)) { + if (isSoftF16(SrcVT, Subtarget)) { MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32; if (IsStrict) return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other}, @@ -23743,7 +23741,7 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const { // This code is only for floats and doubles. Fall back to generic code for // anything else. - if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftFP16(SrcVT)) + if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftF16(SrcVT, Subtarget)) return SDValue(); EVT SatVT = cast(Node->getOperand(1))->getVT(); @@ -23888,6 +23886,10 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { !Subtarget.getTargetTriple().isOSDarwin())) return SDValue(); + if ((SVT == MVT::v8f16 && Subtarget.hasF16C()) || + (SVT == MVT::v16f16 && Subtarget.useAVX512Regs())) + return Op; + if (SVT == MVT::f16) { if (Subtarget.hasFP16()) return Op; @@ -23960,7 +23962,25 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { if (!SVT.isVector()) return Op; + if (SVT.getVectorElementType() == MVT::bf16) { + // FIXME: Do we need to support strict FP? 
+ assert(!IsStrict && "Strict FP doesn't support BF16"); + if (VT.getVectorElementType() == MVT::f64) { + MVT TmpVT = VT.changeVectorElementType(MVT::f32); + return DAG.getNode(ISD::FP_EXTEND, DL, VT, + DAG.getNode(ISD::FP_EXTEND, DL, TmpVT, In)); + } + assert(VT.getVectorElementType() == MVT::f32 && "Unexpected fpext"); + MVT NVT = SVT.changeVectorElementType(MVT::i32); + In = DAG.getBitcast(SVT.changeTypeToInteger(), In); + In = DAG.getNode(ISD::ZERO_EXTEND, DL, NVT, In); + In = DAG.getNode(ISD::SHL, DL, NVT, In, DAG.getConstant(16, DL, NVT)); + return DAG.getBitcast(VT, In); + } + if (SVT.getVectorElementType() == MVT::f16) { + if (Subtarget.hasFP16() && isTypeLegal(SVT)) + return Op; assert(Subtarget.hasF16C() && "Unexpected features!"); if (SVT == MVT::v2f16) In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In, @@ -24033,6 +24053,12 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { return Res; } + if (VT.getScalarType() == MVT::bf16) { + if (SVT.getScalarType() == MVT::f32 && isTypeLegal(VT)) + return Op; + return SDValue(); + } + if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) { if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32) return SDValue(); @@ -25676,7 +25702,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, if (isFP) { MVT EltVT = Op0.getSimpleValueType().getVectorElementType(); assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64); - if (isSoftFP16(EltVT, Subtarget)) + if (isSoftF16(EltVT, Subtarget)) return SDValue(); bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; @@ -26241,7 +26267,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { ISD::CondCode CC = cast(Op.getOperand(IsStrict ? 3 : 2))->get(); - if (isSoftFP16(Op0.getValueType())) + if (isSoftF16(Op0.getValueType(), Subtarget)) return SDValue(); // Handle f128 first, since one possible outcome is a normal integer @@ -26434,7 +26460,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op1.getSimpleValueType(); SDValue CC; - if (isSoftFP16(VT)) { + if (isSoftF16(VT, Subtarget)) { MVT NVT = VT.changeTypeToInteger(); return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond, DAG.getBitcast(NVT, Op1), @@ -26506,7 +26532,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } if (Cond.getOpcode() == ISD::SETCC && - !isSoftFP16(Cond.getOperand(0).getSimpleValueType())) { + !isSoftF16(Cond.getOperand(0).getSimpleValueType(), Subtarget)) { if (SDValue NewCond = LowerSETCC(Cond, DAG)) { Cond = NewCond; // If the condition was updated, it's possible that the operands of the @@ -27196,7 +27222,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { // Bail out when we don't have native compare instructions. if (Cond.getOpcode() == ISD::SETCC && Cond.getOperand(0).getValueType() != MVT::f128 && - !isSoftFP16(Cond.getOperand(0).getValueType())) { + !isSoftF16(Cond.getOperand(0).getValueType(), Subtarget)) { SDValue LHS = Cond.getOperand(0); SDValue RHS = Cond.getOperand(1); ISD::CondCode CC = cast(Cond.getOperand(2))->get(); @@ -34983,7 +35009,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, EVT SrcVT = Src.getValueType(); SDValue Res; - if (isSoftFP16(SrcVT)) { + if (isSoftF16(SrcVT, Subtarget)) { EVT NVT = VT.isVector() ? 
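The bf16 branch of LowerFP_EXTEND above widens by zero-extending the bf16 bits to i32, shifting left by 16, and bitcasting, because bf16 is exactly the upper half of an IEEE-754 binary32. A scalar sketch of the same bit manipulation:

// Extending bf16 to f32: place the 16 bf16 bits into the high half of a
// 32-bit word (zero-extend + shl 16), then reinterpret the bits as float.
#include <cstdint>
#include <cstring>
#include <cstdio>

float bf16ToFloat(uint16_t Bits) {
  uint32_t Widened = static_cast<uint32_t>(Bits) << 16; // zero-extend + shl 16
  float F;
  std::memcpy(&F, &Widened, sizeof(F));                 // bitcast to f32
  return F;
}

int main() {
  std::printf("%f\n", bf16ToFloat(0x3F80)); // 1.0
  std::printf("%f\n", bf16ToFloat(0x4000)); // 2.0
  std::printf("%f\n", bf16ToFloat(0xC040)); // -3.0
}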
VT.changeVectorElementType(MVT::f32) : MVT::f32; if (IsStrict) { Res = @@ -47383,7 +47409,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // ignored in unsafe-math mode). // We also try to create v2f32 min/max nodes, which we later widen to v4f32. if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && - VT != MVT::f80 && VT != MVT::f128 && !isSoftFP16(VT, Subtarget) && + VT != MVT::f80 && VT != MVT::f128 && !isSoftF16(VT, Subtarget) && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) && (Subtarget.hasSSE2() || (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) { @@ -47700,7 +47726,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, } // Early exit check - if (!TLI.isTypeLegal(VT) || isSoftFP16(VT, Subtarget)) + if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget)) return SDValue(); if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget)) @@ -54550,7 +54576,7 @@ static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) { static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); - if (Subtarget.useSoftFloat() || isSoftFP16(VT, Subtarget)) + if (Subtarget.useSoftFloat() || isSoftF16(VT, Subtarget)) return SDValue(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 250df82a30c2..047d8f021047 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1749,8 +1749,6 @@ namespace llvm { bool needsCmpXchgNb(Type *MemType) const; - template bool isSoftFP16(T VT) const; - void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB, MachineBasicBlock *DispatchBB, int FI) const; diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index ecb5c3e91240..b5dac7a0c65a 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -12976,6 +12976,11 @@ let Predicates = [HasBF16, HasVLX] in { def : Pat<(v16bf16 (X86VBroadcast (v8bf16 VR128X:$src))), (VPBROADCASTWZ256rr VR128X:$src)>; + def : Pat<(v8bf16 (X86vfpround (v8f32 VR256X:$src))), + (VCVTNEPS2BF16Z256rr VR256X:$src)>; + def : Pat<(v8bf16 (X86vfpround (loadv8f32 addr:$src))), + (VCVTNEPS2BF16Z256rm addr:$src)>; + // TODO: No scalar broadcast due to we don't support legal scalar bf16 so far. } @@ -12985,6 +12990,11 @@ let Predicates = [HasBF16] in { def : Pat<(v32bf16 (X86VBroadcast (v8bf16 VR128X:$src))), (VPBROADCASTWZrr VR128X:$src)>; + + def : Pat<(v16bf16 (X86vfpround (v16f32 VR512:$src))), + (VCVTNEPS2BF16Zrr VR512:$src)>; + def : Pat<(v16bf16 (X86vfpround (loadv16f32 addr:$src))), + (VCVTNEPS2BF16Zrm addr:$src)>; // TODO: No scalar broadcast due to we don't support legal scalar bf16 so far. 
} diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 6c57eceab376..a6fcc804e1d0 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -8289,6 +8289,11 @@ let Predicates = [HasAVXNECONVERT] in { f256mem>, T8PS; let checkVEXPredicate = 1 in defm VCVTNEPS2BF16 : VCVTNEPS2BF16_BASE, VEX, T8XS, ExplicitVEXPrefix; + + def : Pat<(v8bf16 (X86vfpround (v8f32 VR256:$src))), + (VCVTNEPS2BF16Yrr VR256:$src)>; + def : Pat<(v8bf16 (X86vfpround (loadv8f32 addr:$src))), + (VCVTNEPS2BF16Yrm addr:$src)>; } def : InstAlias<"vcvtneps2bf16x\t{$src, $dst|$dst, $src}", diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 17981b3b9374..129a2646dbb7 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -5944,9 +5944,7 @@ bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) { (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX()))); } -bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) { - if (!supportsGather()) - return false; +bool X86TTIImpl::isLegalMaskedGatherScatter(Type *DataTy, Align Alignment) { Type *ScalarTy = DataTy->getScalarType(); if (ScalarTy->isPointerTy()) return true; @@ -5961,6 +5959,12 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) { return IntWidth == 32 || IntWidth == 64; } +bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) { + if (!supportsGather() || !ST->preferGather()) + return false; + return isLegalMaskedGatherScatter(DataTy, Alignment); +} + bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const { @@ -5996,9 +6000,9 @@ bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) { // AVX2 doesn't support scatter - if (!ST->hasAVX512()) + if (!ST->hasAVX512() || !ST->preferScatter()) return false; - return isLegalMaskedGather(DataType, Alignment); + return isLegalMaskedGatherScatter(DataType, Alignment); } bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) { diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index 89c7916260a4..0fa0d240a548 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -261,6 +261,7 @@ class X86TTIImpl : public BasicTTIImplBase { bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment) { return forceScalarizeMaskedGather(VTy, Alignment); } + bool isLegalMaskedGatherScatter(Type *DataType, Align Alignment); bool isLegalMaskedGather(Type *DataType, Align Alignment); bool isLegalMaskedScatter(Type *DataType, Align Alignment); bool isLegalMaskedExpandLoad(Type *DataType); diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 518c859b11cc..81309280a44b 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -1241,8 +1241,11 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf, if (HasLeaf7 && ((EDX >> 8) & 1) && HasAVX512Save) setFeature(X86::FEATURE_AVX512VP2INTERSECT); + // EAX from subleaf 0 is the maximum subleaf supported. Some CPUs don't + // return all 0s for invalid subleaves so check the limit. 
bool HasLeaf7Subleaf1 = - MaxLeaf >= 7 && !getX86CpuIDAndInfoEx(0x7, 0x1, &EAX, &EBX, &ECX, &EDX); + HasLeaf7 && EAX >= 1 && + !getX86CpuIDAndInfoEx(0x7, 0x1, &EAX, &EBX, &ECX, &EDX); if (HasLeaf7Subleaf1 && ((EAX >> 5) & 1) && HasAVX512Save) setFeature(X86::FEATURE_AVX512BF16); @@ -1750,8 +1753,11 @@ bool sys::getHostCPUFeatures(StringMap &Features) { Features["avx512fp16"] = HasLeaf7 && ((EDX >> 23) & 1) && HasAVX512Save; Features["amx-tile"] = HasLeaf7 && ((EDX >> 24) & 1) && HasAMXSave; Features["amx-int8"] = HasLeaf7 && ((EDX >> 25) & 1) && HasAMXSave; + // EAX from subleaf 0 is the maximum subleaf supported. Some CPUs don't + // return all 0s for invalid subleaves so check the limit. bool HasLeaf7Subleaf1 = - MaxLevel >= 7 && !getX86CpuIDAndInfoEx(0x7, 0x1, &EAX, &EBX, &ECX, &EDX); + HasLeaf7 && EAX >= 1 && + !getX86CpuIDAndInfoEx(0x7, 0x1, &EAX, &EBX, &ECX, &EDX); Features["sha512"] = HasLeaf7Subleaf1 && ((EAX >> 0) & 1); Features["sm3"] = HasLeaf7Subleaf1 && ((EAX >> 1) & 1); Features["sm4"] = HasLeaf7Subleaf1 && ((EAX >> 2) & 1); diff --git a/llvm/lib/TargetParser/LoongArchTargetParser.cpp b/llvm/lib/TargetParser/LoongArchTargetParser.cpp index 72781513ff12..772d24c5ce3d 100644 --- a/llvm/lib/TargetParser/LoongArchTargetParser.cpp +++ b/llvm/lib/TargetParser/LoongArchTargetParser.cpp @@ -16,9 +16,6 @@ using namespace llvm; using namespace llvm::LoongArch; -StringRef Arch; -StringRef TuneCPU; - const FeatureInfo AllFeatures[] = { #define LOONGARCH_FEATURE(NAME, KIND) {NAME, KIND}, #include "llvm/TargetParser/LoongArchTargetParser.def" @@ -50,11 +47,9 @@ bool LoongArch::getArchFeatures(StringRef Arch, return false; } -bool LoongArch::isValidTuneCPUName(StringRef TuneCPU) { - return isValidArchName(TuneCPU); -} +bool LoongArch::isValidCPUName(StringRef Name) { return isValidArchName(Name); } -void LoongArch::fillValidTuneCPUList(SmallVectorImpl &Values) { +void LoongArch::fillValidCPUList(SmallVectorImpl &Values) { for (const auto A : AllArchs) Values.emplace_back(A.Name); } @@ -63,11 +58,3 @@ StringRef LoongArch::getDefaultArch(bool Is64Bit) { // TODO: use a real 32-bit arch name. return Is64Bit ? "loongarch64" : ""; } - -void LoongArch::setArch(StringRef Name) { Arch = Name; } - -StringRef LoongArch::getArch() { return Arch; } - -void LoongArch::setTuneCPU(StringRef Name) { TuneCPU = Name; } - -StringRef LoongArch::getTuneCPU() { return TuneCPU; } diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 34c8a380448e..503ce019dc84 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -19,7 +19,6 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -29,7 +28,6 @@ #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/PatternMatch.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/Local.h" @@ -398,6 +396,54 @@ static bool tryToFPToSat(Instruction &I, TargetTransformInfo &TTI) { return true; } +/// Try to replace a mathlib call to sqrt with the LLVM intrinsic. 
This avoids +/// pessimistic codegen that has to account for setting errno and can enable +/// vectorization. +static bool foldSqrt(Instruction &I, TargetTransformInfo &TTI, + TargetLibraryInfo &TLI, AssumptionCache &AC, + DominatorTree &DT) { + // Match a call to sqrt mathlib function. + auto *Call = dyn_cast(&I); + if (!Call) + return false; + + Module *M = Call->getModule(); + LibFunc Func; + if (!TLI.getLibFunc(*Call, Func) || !isLibFuncEmittable(M, &TLI, Func)) + return false; + + if (Func != LibFunc_sqrt && Func != LibFunc_sqrtf && Func != LibFunc_sqrtl) + return false; + + // If (1) this is a sqrt libcall, (2) we can assume that NAN is not created + // (because NNAN or the operand arg must not be less than -0.0) and (2) we + // would not end up lowering to a libcall anyway (which could change the value + // of errno), then: + // (1) errno won't be set. + // (2) it is safe to convert this to an intrinsic call. + Type *Ty = Call->getType(); + Value *Arg = Call->getArgOperand(0); + if (TTI.haveFastSqrt(Ty) && + (Call->hasNoNaNs() || + cannotBeOrderedLessThanZero(Arg, M->getDataLayout(), &TLI, 0, &AC, &I, + &DT))) { + IRBuilder<> Builder(&I); + IRBuilderBase::FastMathFlagGuard Guard(Builder); + Builder.setFastMathFlags(Call->getFastMathFlags()); + + Function *Sqrt = Intrinsic::getDeclaration(M, Intrinsic::sqrt, Ty); + Value *NewSqrt = Builder.CreateCall(Sqrt, Arg, "sqrt"); + I.replaceAllUsesWith(NewSqrt); + + // Explicitly erase the old call because a call with side effects is not + // trivially dead. + I.eraseFromParent(); + return true; + } + + return false; +} + // Check if this array of constants represents a cttz table. // Iterate over the elements from \p Table by trying to find/match all // the numbers from 0 to \p InputBits that should represent cttz results. @@ -869,159 +915,13 @@ static bool foldPatternedLoads(Instruction &I, const DataLayout &DL) { return true; } -/// Try to replace a mathlib call to sqrt with the LLVM intrinsic. This avoids -/// pessimistic codegen that has to account for setting errno and can enable -/// vectorization. -static bool foldSqrt(CallInst *Call, TargetTransformInfo &TTI, - TargetLibraryInfo &TLI, AssumptionCache &AC, - DominatorTree &DT) { - Module *M = Call->getModule(); - - // If (1) this is a sqrt libcall, (2) we can assume that NAN is not created - // (because NNAN or the operand arg must not be less than -0.0) and (2) we - // would not end up lowering to a libcall anyway (which could change the value - // of errno), then: - // (1) errno won't be set. - // (2) it is safe to convert this to an intrinsic call. - Type *Ty = Call->getType(); - Value *Arg = Call->getArgOperand(0); - if (TTI.haveFastSqrt(Ty) && - (Call->hasNoNaNs() || - cannotBeOrderedLessThanZero(Arg, M->getDataLayout(), &TLI, 0, &AC, Call, - &DT))) { - IRBuilder<> Builder(Call); - IRBuilderBase::FastMathFlagGuard Guard(Builder); - Builder.setFastMathFlags(Call->getFastMathFlags()); - - Function *Sqrt = Intrinsic::getDeclaration(M, Intrinsic::sqrt, Ty); - Value *NewSqrt = Builder.CreateCall(Sqrt, Arg, "sqrt"); - Call->replaceAllUsesWith(NewSqrt); - - // Explicitly erase the old call because a call with side effects is not - // trivially dead. - Call->eraseFromParent(); - return true; - } - - return false; -} - -/// Try to expand strcmp(P, "x") calls. 
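The foldSqrt logic above only fires when fast sqrt is available and either the call carries nnan or the argument provably cannot be ordered less than -0.0, so errno can never be set. A minimal C++ sketch of a call shape this analysis is meant to cover; whether a given compiler actually rewrites it depends on the target and fast-math settings, so treat it as an illustration rather than a guarantee.

// Illustrative sketch, not part of the patch. x*x + y*y cannot be ordered
// less than zero, so sqrt cannot fail and cannot set errno; calls of this
// shape are candidates for becoming the llvm.sqrt intrinsic, which keeps the
// loop vectorizable.
#include <cmath>

float norms(const float *x, const float *y, float *out, int n) {
  float last = 0.0f;
  for (int i = 0; i < n; ++i) {
    last = std::sqrt(x[i] * x[i] + y[i] * y[i]);
    out[i] = last;
  }
  return last;
}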
-static bool expandStrcmp(CallInst *CI, DominatorTree &DT, bool &MadeCFGChange) { - Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1); - - // Trivial cases are optimized during inst combine - if (Str1P == Str2P) - return false; - - StringRef Str1, Str2; - bool HasStr1 = getConstantStringInfo(Str1P, Str1); - bool HasStr2 = getConstantStringInfo(Str2P, Str2); - - Value *NonConstantP = nullptr; - StringRef ConstantStr; - - if (!HasStr1 && HasStr2 && Str2.size() == 1) { - NonConstantP = Str1P; - ConstantStr = Str2; - } else if (!HasStr2 && HasStr1 && Str1.size() == 1) { - NonConstantP = Str2P; - ConstantStr = Str1; - } else { - return false; - } - - // Check if strcmp result is only used in a comparison with zero - if (!isOnlyUsedInZeroComparison(CI)) - return false; - - // For strcmp(P, "x") do the following transformation: - // - // (before) - // dst = strcmp(P, "x") - // - // (after) - // v0 = P[0] - 'x' - // [if v0 == 0] - // v1 = P[1] - // dst = phi(v0, v1) - // - - IRBuilder<> B(CI->getParent()); - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); - - Type *RetType = CI->getType(); - - B.SetInsertPoint(CI); - BasicBlock *InitialBB = B.GetInsertBlock(); - Value *Str1FirstCharacterValue = - B.CreateZExt(B.CreateLoad(B.getInt8Ty(), NonConstantP), RetType); - Value *Str2FirstCharacterValue = - ConstantInt::get(RetType, static_cast(ConstantStr[0])); - Value *FirstCharacterSub = - B.CreateNSWSub(Str1FirstCharacterValue, Str2FirstCharacterValue); - Value *IsFirstCharacterSubZero = - B.CreateICmpEQ(FirstCharacterSub, ConstantInt::get(RetType, 0)); - Instruction *IsFirstCharacterSubZeroBBTerminator = SplitBlockAndInsertIfThen( - IsFirstCharacterSubZero, CI, /*Unreachable*/ false, - /*BranchWeights*/ nullptr, &DTU); - - B.SetInsertPoint(IsFirstCharacterSubZeroBBTerminator); - B.GetInsertBlock()->setName("strcmp_expand_sub_is_zero"); - BasicBlock *IsFirstCharacterSubZeroBB = B.GetInsertBlock(); - Value *Str1SecondCharacterValue = B.CreateZExt( - B.CreateLoad(B.getInt8Ty(), B.CreateConstInBoundsGEP1_64( - B.getInt8Ty(), NonConstantP, 1)), - RetType); - - B.SetInsertPoint(CI); - B.GetInsertBlock()->setName("strcmp_expand_sub_join"); - - PHINode *Result = B.CreatePHI(RetType, 2); - Result->addIncoming(FirstCharacterSub, InitialBB); - Result->addIncoming(Str1SecondCharacterValue, IsFirstCharacterSubZeroBB); - - CI->replaceAllUsesWith(Result); - CI->eraseFromParent(); - - MadeCFGChange = true; - - return true; -} - -static bool foldLibraryCalls(Instruction &I, TargetTransformInfo &TTI, - TargetLibraryInfo &TLI, DominatorTree &DT, - AssumptionCache &AC, bool &MadeCFGChange) { - CallInst *CI = dyn_cast(&I); - if (!CI) - return false; - - LibFunc Func; - Module *M = I.getModule(); - if (!TLI.getLibFunc(*CI, Func) || !isLibFuncEmittable(M, &TLI, Func)) - return false; - - switch (Func) { - case LibFunc_sqrt: - case LibFunc_sqrtf: - case LibFunc_sqrtl: - return foldSqrt(CI, TTI, TLI, AC, DT); - case LibFunc_strcmp: - return expandStrcmp(CI, DT, MadeCFGChange); - default: - break; - } - - return false; -} - /// This is the entry point for folds that could be implemented in regular /// InstCombine, but they are separated because they are not expected to /// occur frequently and/or have more than a constant-length pattern match. 
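The expandStrcmp helper removed above implements, in IR, the transformation spelled out in its (before)/(after) comment. The same idea in plain C++, valid when the strcmp result is only compared against zero: check the first byte against the one-character constant, and only when it matches load the second byte (the comparison against the terminating NUL). This sketch restates the comment and is not taken from the patch.

// Illustrative sketch, not part of the patch.
static int strcmp_with_x_expanded(const unsigned char *p) {
  int v0 = p[0] - 'x';   // first character vs. the constant
  if (v0 != 0)
    return v0;           // differs already; the second byte is never loaded
  return p[1];           // first bytes equal: result is p[1] - '\0'
}

bool equals_x(const char *p) {
  // The guard in the removed code requires exactly this kind of use: the
  // result only ever feeds a comparison with zero.
  return strcmp_with_x_expanded(reinterpret_cast<const unsigned char *>(p)) == 0;
}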
static bool foldUnusualPatterns(Function &F, DominatorTree &DT, TargetTransformInfo &TTI, TargetLibraryInfo &TLI, AliasAnalysis &AA, - AssumptionCache &AC, bool &MadeCFGChange) { + AssumptionCache &AC) { bool MadeChange = false; for (BasicBlock &BB : F) { // Ignore unreachable basic blocks. @@ -1046,7 +946,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT, // NOTE: This function introduces erasing of the instruction `I`, so it // needs to be called at the end of this sequence, otherwise we may make // bugs. - MadeChange |= foldLibraryCalls(I, TTI, TLI, DT, AC, MadeCFGChange); + MadeChange |= foldSqrt(I, TTI, TLI, AC, DT); } } @@ -1062,12 +962,12 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT, /// handled in the callers of this function. static bool runImpl(Function &F, AssumptionCache &AC, TargetTransformInfo &TTI, TargetLibraryInfo &TLI, DominatorTree &DT, - AliasAnalysis &AA, bool &ChangedCFG) { + AliasAnalysis &AA) { bool MadeChange = false; const DataLayout &DL = F.getParent()->getDataLayout(); TruncInstCombine TIC(AC, TLI, DL, DT); MadeChange |= TIC.run(F); - MadeChange |= foldUnusualPatterns(F, DT, TTI, TLI, AA, AC, ChangedCFG); + MadeChange |= foldUnusualPatterns(F, DT, TTI, TLI, AA, AC); return MadeChange; } @@ -1078,21 +978,12 @@ PreservedAnalyses AggressiveInstCombinePass::run(Function &F, auto &DT = AM.getResult(F); auto &TTI = AM.getResult(F); auto &AA = AM.getResult(F); - - bool MadeCFGChange = false; - - if (!runImpl(F, AC, TTI, TLI, DT, AA, MadeCFGChange)) { + if (!runImpl(F, AC, TTI, TLI, DT, AA)) { // No changes, all analyses are preserved. return PreservedAnalyses::all(); } - // Mark all the analyses that instcombine updates as preserved. PreservedAnalyses PA; - - if (MadeCFGChange) - PA.preserve(); - else - PA.preserveSet(); - + PA.preserveSet(); return PA; } diff --git a/llvm/lib/Transforms/Coroutines/CoroElide.cpp b/llvm/lib/Transforms/Coroutines/CoroElide.cpp index d78ab1c1ea28..d0606c15f3d5 100644 --- a/llvm/lib/Transforms/Coroutines/CoroElide.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroElide.cpp @@ -194,12 +194,49 @@ bool Lowerer::hasEscapePath(const CoroBeginInst *CB, for (auto *DA : It->second) Visited.insert(DA->getParent()); + SmallPtrSet EscapingBBs; + for (auto *U : CB->users()) { + // The use from coroutine intrinsics are not a problem. + if (isa(U)) + continue; + + // Think all other usages may be an escaping candidate conservatively. + // + // Note that the major user of switch ABI coroutine (the C++) will store + // resume.fn, destroy.fn and the index to the coroutine frame immediately. + // So the parent of the coro.begin in C++ will be always escaping. + // Then we can't get any performance benefits for C++ by improving the + // precision of the method. + // + // The reason why we still judge it is we want to make LLVM Coroutine in + // switch ABIs to be self contained as much as possible instead of a + // by-product of C++20 Coroutines. + EscapingBBs.insert(cast(U)->getParent()); + } + + bool PotentiallyEscaped = false; + do { const auto *BB = Worklist.pop_back_val(); if (!Visited.insert(BB).second) continue; - if (TIs.count(BB)) - return true; + + // A Path insensitive marker to test whether the coro.begin escapes. + // It is intentional to make it path insensitive while it may not be + // precise since we don't want the process to be too slow. 
+ PotentiallyEscaped |= EscapingBBs.count(BB); + + if (TIs.count(BB)) { + if (!BB->getTerminator()->isExceptionalTerminator() || PotentiallyEscaped) + return true; + + // If the function ends with the exceptional terminator, the memory used + // by the coroutine frame can be released by stack unwinding + // automatically. So we can think the coro.begin doesn't escape if it + // exits the function by exceptional terminator. + + continue; + } // Conservatively say that there is potentially a path. if (!--Limit) @@ -236,36 +273,36 @@ bool Lowerer::shouldElide(Function *F, DominatorTree &DT) const { // memory location storing that value and not the virtual register. SmallPtrSet Terminators; - // First gather all of the non-exceptional terminators for the function. + // First gather all of the terminators for the function. // Consider the final coro.suspend as the real terminator when the current // function is a coroutine. - for (BasicBlock &B : *F) { - auto *TI = B.getTerminator(); - if (TI->getNumSuccessors() == 0 && !TI->isExceptionalTerminator() && - !isa(TI)) - Terminators.insert(&B); - } + for (BasicBlock &B : *F) { + auto *TI = B.getTerminator(); + + if (TI->getNumSuccessors() != 0 || isa(TI)) + continue; + + Terminators.insert(&B); + } // Filter out the coro.destroy that lie along exceptional paths. SmallPtrSet ReferencedCoroBegins; for (const auto &It : DestroyAddr) { - // If there is any coro.destroy dominates all of the terminators for the - // coro.begin, we could know the corresponding coro.begin wouldn't escape. - for (Instruction *DA : It.second) { - if (llvm::all_of(Terminators, [&](auto *TI) { - return DT.dominates(DA, TI->getTerminator()); - })) { - ReferencedCoroBegins.insert(It.first); - break; - } - } - - // Whether there is any paths from coro.begin to Terminators which not pass - // through any of the coro.destroys. + // If every terminators is dominated by coro.destroy, we could know the + // corresponding coro.begin wouldn't escape. + // + // Otherwise hasEscapePath would decide whether there is any paths from + // coro.begin to Terminators which not pass through any of the + // coro.destroys. // // hasEscapePath is relatively slow, so we avoid to run it as much as // possible. 
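To make the escape reasoning above concrete at the source level: the case CoroElide is after is a coroutine whose handle never leaves the calling function and whose destroy runs on every normal exit path, so coro.begin does not escape and the frame allocation can be elided. The generator type below is a minimal illustrative sketch, not code from LLVM or from this patch, and it assumes C++20.

// Illustrative sketch, not part of the patch.
#include <coroutine>
#include <cstdio>
#include <exception>

struct Gen {
  struct promise_type {
    int value = 0;
    Gen get_return_object() {
      return Gen{std::coroutine_handle<promise_type>::from_promise(*this)};
    }
    std::suspend_always initial_suspend() noexcept { return {}; }
    std::suspend_always final_suspend() noexcept { return {}; }
    std::suspend_always yield_value(int v) noexcept { value = v; return {}; }
    void return_void() noexcept {}
    void unhandled_exception() { std::terminate(); }
  };
  std::coroutine_handle<promise_type> h;
  ~Gen() { if (h) h.destroy(); }   // destroy on the normal exit path
};

Gen counter(int n) {
  for (int i = 0; i < n; ++i)
    co_yield i;
}

int main() {
  Gen g = counter(3);              // handle never escapes main
  while (!g.h.done()) {
    g.h.resume();
    if (!g.h.done())
      std::printf("%d\n", g.h.promise().value);
  }
}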
- if (!ReferencedCoroBegins.count(It.first) && + if (llvm::all_of(Terminators, + [&](auto *TI) { + return llvm::any_of(It.second, [&](auto *DA) { + return DT.dominates(DA, TI->getTerminator()); + }); + }) || !hasEscapePath(It.first, Terminators)) ReferencedCoroBegins.insert(It.first); } diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index afd6e034f46d..767b7c7defbb 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -906,7 +906,7 @@ InstCombinerImpl::foldBinOpOfSelectAndCastOfSelectCondition(BinaryOperator &I) { auto NewFoldedConst = [&](bool IsTrueArm, Value *V) { bool IsCastOpRHS = (CastOp == RHS); - bool IsZExt = isa(CastOp); + bool IsZExt = isa(CastOp); Constant *C; if (IsTrueArm) { diff --git a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp index 3e3be536defc..597cec8e61c9 100644 --- a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp +++ b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp @@ -1777,6 +1777,20 @@ void CHR::cloneScopeBlocks(CHRScope *Scope, BasicBlock *NewBB = CloneBasicBlock(BB, VMap, ".nonchr", &F); NewBlocks.push_back(NewBB); VMap[BB] = NewBB; + + // Unreachable predecessors will not be cloned and will not have an edge + // to the cloned block. As such, also remove them from any phi nodes. + // To avoid iterator invalidation, first collect the dead predecessors + // from the first phi node, and then perform the actual removal. + if (auto *FirstPN = dyn_cast(NewBB->begin())) { + SmallVector DeadPreds; + for (BasicBlock *Pred : FirstPN->blocks()) + if (!DT.isReachableFromEntry(Pred)) + DeadPreds.push_back(Pred); + for (PHINode &PN : make_early_inc_range(NewBB->phis())) + for (BasicBlock *Pred : DeadPreds) + PN.removeIncomingValue(Pred); + } } // Place the cloned blocks right after the original blocks (right before the diff --git a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp index 21f0b1a92293..75adcabc0d34 100644 --- a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -898,7 +898,9 @@ bool GCOVProfiler::emitProfileNotes( if (Line == Loc.getLine()) continue; Line = Loc.getLine(); - if (SP != getDISubprogram(Loc.getScope())) + MDNode *Scope = Loc.getScope(); + // TODO: Handle blocks from another file due to #line, #include, etc. 
+ if (isa(Scope) || SP != getDISubprogram(Scope)) continue; GCOVLines &Lines = Block.getFile(Filename); diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index 15628d32280d..2b88dd08d88b 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -424,7 +424,7 @@ static Decomposition decompose(Value *V, return MergeResults(Op0, Op1, IsSigned); ConstantInt *CI; - if (match(V, m_NSWMul(m_Value(Op0), m_ConstantInt(CI)))) { + if (match(V, m_NSWMul(m_Value(Op0), m_ConstantInt(CI))) && canUseSExt(CI)) { auto Result = decompose(Op0, Preconditions, IsSigned, DL); Result.mul(CI->getSExtValue()); return Result; diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 68642a01b37c..00937e0d734a 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -69,7 +69,6 @@ STATISTIC(NumMemSetInfer, "Number of memsets inferred"); STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy"); STATISTIC(NumCpyToSet, "Number of memcpys converted to memset"); STATISTIC(NumCallSlot, "Number of call slot optimizations performed"); -STATISTIC(NumStackMove, "Number of stack-move optimizations performed"); namespace { @@ -731,23 +730,6 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI, return true; } - // If this is a load-store pair from a stack slot to a stack slot, we - // might be able to perform the stack-move optimization just as we do for - // memcpys from an alloca to an alloca. - if (auto *DestAlloca = dyn_cast(SI->getPointerOperand())) { - if (auto *SrcAlloca = dyn_cast(LI->getPointerOperand())) { - if (performStackMoveOptzn(LI, SI, DestAlloca, SrcAlloca, - DL.getTypeStoreSize(T), BAA)) { - // Avoid invalidating the iterator. - BBI = SI->getNextNonDebugInstruction()->getIterator(); - eraseInstruction(SI); - eraseInstruction(LI); - ++NumMemCpyInstr; - return true; - } - } - } - return false; } @@ -1426,217 +1408,6 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, return true; } -// Attempts to optimize the pattern whereby memory is copied from an alloca to -// another alloca, where the two allocas don't have conflicting mod/ref. If -// successful, the two allocas can be merged into one and the transfer can be -// deleted. This pattern is generated frequently in Rust, due to the ubiquity of -// move operations in that language. -// -// Once we determine that the optimization is safe to perform, we replace all -// uses of the destination alloca with the source alloca. We also "shrink wrap" -// the lifetime markers of the single merged alloca to before the first use -// and after the last use. Note that the "shrink wrapping" procedure is a safe -// transformation only because we restrict the scope of this optimization to -// allocas that aren't captured. -bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store, - AllocaInst *DestAlloca, - AllocaInst *SrcAlloca, uint64_t Size, - BatchAAResults &BAA) { - LLVM_DEBUG(dbgs() << "Stack Move: Attempting to optimize:\n" - << *Store << "\n"); - - // Make sure the two allocas are in the same address space. - if (SrcAlloca->getAddressSpace() != DestAlloca->getAddressSpace()) { - LLVM_DEBUG(dbgs() << "Stack Move: Address space mismatch\n"); - return false; - } - - // 1. Check that copy is full. 
Calculate the static size of the allocas to be - // merged, bail out if we can't. - const DataLayout &DL = DestAlloca->getModule()->getDataLayout(); - std::optional SrcSize = SrcAlloca->getAllocationSize(DL); - if (!SrcSize || SrcSize->isScalable() || Size != SrcSize->getFixedValue()) { - LLVM_DEBUG(dbgs() << "Stack Move: Source alloca size mismatch\n"); - return false; - } - std::optional DestSize = DestAlloca->getAllocationSize(DL); - if (!DestSize || DestSize->isScalable() || - Size != DestSize->getFixedValue()) { - LLVM_DEBUG(dbgs() << "Stack Move: Destination alloca size mismatch\n"); - return false; - } - - // 2-1. Check that src and dest are static allocas, which are not affected by - // stacksave/stackrestore. - if (!SrcAlloca->isStaticAlloca() || !DestAlloca->isStaticAlloca() || - SrcAlloca->getParent() != Load->getParent() || - SrcAlloca->getParent() != Store->getParent()) - return false; - - // 2-2. Check that src and dest are never captured, unescaped allocas. Also - // collect lifetime markers first/last users in order to shrink wrap the - // lifetimes, and instructions with noalias metadata to remove them. - - SmallVector LifetimeMarkers; - Instruction *FirstUser = nullptr, *LastUser = nullptr; - SmallSet NoAliasInstrs; - - // Recursively track the user and check whether modified alias exist. - auto IsDereferenceableOrNull = [](Value *V, const DataLayout &DL) -> bool { - bool CanBeNull, CanBeFreed; - return V->getPointerDereferenceableBytes(DL, CanBeNull, CanBeFreed); - }; - - auto CaptureTrackingWithModRef = - [&](Instruction *AI, - function_ref ModRefCallback) -> bool { - SmallVector Worklist; - Worklist.push_back(AI); - unsigned MaxUsesToExplore = getDefaultMaxUsesToExploreForCaptureTracking(); - Worklist.reserve(MaxUsesToExplore); - SmallSet Visited; - while (!Worklist.empty()) { - Instruction *I = Worklist.back(); - Worklist.pop_back(); - for (const Use &U : I->uses()) { - if (Visited.size() >= MaxUsesToExplore) { - LLVM_DEBUG( - dbgs() - << "Stack Move: Exceeded max uses to see ModRef, bailing\n"); - return false; - } - if (!Visited.insert(&U).second) - continue; - switch (DetermineUseCaptureKind(U, IsDereferenceableOrNull)) { - case UseCaptureKind::MAY_CAPTURE: - return false; - case UseCaptureKind::PASSTHROUGH: - // Instructions cannot have non-instruction users. - Worklist.push_back(cast(U.getUser())); - continue; - case UseCaptureKind::NO_CAPTURE: { - auto *UI = cast(U.getUser()); - if (DestAlloca->getParent() != UI->getParent()) - return false; - if (!FirstUser || UI->comesBefore(FirstUser)) - FirstUser = UI; - if (!LastUser || LastUser->comesBefore(UI)) - LastUser = UI; - if (UI->isLifetimeStartOrEnd()) { - // We note the locations of these intrinsic calls so that we can - // delete them later if the optimization succeeds, this is safe - // since both llvm.lifetime.start and llvm.lifetime.end intrinsics - // conceptually fill all the bytes of the alloca with an undefined - // value. - int64_t Size = cast(UI->getOperand(0))->getSExtValue(); - if (Size < 0 || Size == DestSize) { - LifetimeMarkers.push_back(UI); - continue; - } - } - if (UI->hasMetadata(LLVMContext::MD_noalias)) - NoAliasInstrs.insert(UI); - if (!ModRefCallback(UI)) - return false; - } - } - } - } - return true; - }; - - // 3. Check that dest has no Mod/Ref, except full size lifetime intrinsics, - // from the alloca to the Store. 
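For readers unfamiliar with the stack-move optimization being removed here, the pattern it targeted looks like the following C++ sketch: a full-size copy between two local allocas in the same block, neither of which escapes, where the source is never touched again after the copy, so the two allocas can share one stack slot. The sketch is illustrative only and not taken from the patch.

// Illustrative sketch, not part of the patch.
#include <cstring>

int stack_move_candidate(int n) {
  char src[64];
  for (int i = 0; i < 64; ++i)          // src is written in place...
    src[i] = static_cast<char>(i * n);
  char dst[64];
  std::memcpy(dst, src, sizeof dst);    // ...then copied wholesale to dst,
  int s = 0;                            // and never read or written again,
  for (char c : dst)                    // so dst and src can be merged.
    s += c;
  return s;
}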
- ModRefInfo DestModRef = ModRefInfo::NoModRef; - MemoryLocation DestLoc(DestAlloca, LocationSize::precise(Size)); - auto DestModRefCallback = [&](Instruction *UI) -> bool { - // We don't care about the store itself. - if (UI == Store) - return true; - ModRefInfo Res = BAA.getModRefInfo(UI, DestLoc); - // FIXME: For multi-BB cases, we need to see reachability from it to - // store. - // Bailout if Dest may have any ModRef before Store. - if (UI->comesBefore(Store) && isModOrRefSet(Res)) - return false; - DestModRef |= BAA.getModRefInfo(UI, DestLoc); - - return true; - }; - - if (!CaptureTrackingWithModRef(DestAlloca, DestModRefCallback)) - return false; - - // 3. Check that, from after the Load to the end of the BB, - // 3-1. if the dest has any Mod, src has no Ref, and - // 3-2. if the dest has any Ref, src has no Mod except full-sized lifetimes. - MemoryLocation SrcLoc(SrcAlloca, LocationSize::precise(Size)); - - auto SrcModRefCallback = [&](Instruction *UI) -> bool { - // Any ModRef before Load doesn't matter, also Load and Store can be - // ignored. - if (UI->comesBefore(Load) || UI == Load || UI == Store) - return true; - ModRefInfo Res = BAA.getModRefInfo(UI, SrcLoc); - if ((isModSet(DestModRef) && isRefSet(Res)) || - (isRefSet(DestModRef) && isModSet(Res))) - return false; - - return true; - }; - - if (!CaptureTrackingWithModRef(SrcAlloca, SrcModRefCallback)) - return false; - - // We can do the transformation. First, align the allocas appropriately. - SrcAlloca->setAlignment( - std::max(SrcAlloca->getAlign(), DestAlloca->getAlign())); - - // Merge the two allocas. - DestAlloca->replaceAllUsesWith(SrcAlloca); - eraseInstruction(DestAlloca); - - // Drop metadata on the source alloca. - SrcAlloca->dropUnknownNonDebugMetadata(); - - // Do "shrink wrap" the lifetimes, if the original lifetime intrinsics exists. - if (!LifetimeMarkers.empty()) { - LLVMContext &C = SrcAlloca->getContext(); - IRBuilder<> Builder(C); - - ConstantInt *AllocaSize = ConstantInt::get(Type::getInt64Ty(C), Size); - // Create a new lifetime start marker before the first user of src or alloca - // users. - Builder.SetInsertPoint(FirstUser->getParent(), FirstUser->getIterator()); - Builder.CreateLifetimeStart(SrcAlloca, AllocaSize); - - // Create a new lifetime end marker after the last user of src or alloca - // users. - // FIXME: If the last user is the terminator for the bb, we can insert - // lifetime.end marker to the immidiate post-dominator, but currently do - // nothing. - if (!LastUser->isTerminator()) { - Builder.SetInsertPoint(LastUser->getParent(), ++LastUser->getIterator()); - Builder.CreateLifetimeEnd(SrcAlloca, AllocaSize); - } - - // Remove all other lifetime markers. - for (Instruction *I : LifetimeMarkers) - eraseInstruction(I); - } - - // As this transformation can cause memory accesses that didn't previously - // alias to begin to alias one another, we remove !noalias metadata from any - // uses of either alloca. This is conservative, but more precision doesn't - // seem worthwhile right now. - for (Instruction *I : NoAliasInstrs) - I->setMetadata(LLVMContext::MD_noalias, nullptr); - - LLVM_DEBUG(dbgs() << "Stack Move: Performed staack-move optimization\n"); - NumStackMove++; - return true; -} - /// Perform simplification of memcpy's. 
If we have memcpy A /// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite /// B to be a memcpy from X to Z (or potentially a memmove, depending on @@ -1693,14 +1464,13 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) { MemoryAccess *SrcClobber = MSSA->getWalker()->getClobberingMemoryAccess( AnyClobber, MemoryLocation::getForSource(M), BAA); - // There are five possible optimizations we can do for memcpy: + // There are four possible optimizations we can do for memcpy: // a) memcpy-memcpy xform which exposes redundance for DSE. // b) call-memcpy xform for return slot optimization. // c) memcpy from freshly alloca'd space or space that has just started // its lifetime copies undefined data, and we can therefore eliminate // the memcpy in favor of the data that was already at the destination. // d) memcpy from a just-memset'd source can be turned into memset. - // e) elimination of memcpy via stack-move optimization. if (auto *MD = dyn_cast(SrcClobber)) { if (Instruction *MI = MD->getMemoryInst()) { if (auto *CopySize = dyn_cast(M->getLength())) { @@ -1719,8 +1489,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) { } } if (auto *MDep = dyn_cast(MI)) - if (processMemCpyMemCpyDependence(M, MDep, BAA)) - return true; + return processMemCpyMemCpyDependence(M, MDep, BAA); if (auto *MDep = dyn_cast(MI)) { if (performMemCpyToMemSetOptzn(M, MDep, BAA)) { LLVM_DEBUG(dbgs() << "Converted memcpy to memset\n"); @@ -1739,27 +1508,6 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) { } } - // If the transfer is from a stack slot to a stack slot, then we may be able - // to perform the stack-move optimization. See the comments in - // performStackMoveOptzn() for more details. - auto *DestAlloca = dyn_cast(M->getDest()); - if (!DestAlloca) - return false; - auto *SrcAlloca = dyn_cast(M->getSource()); - if (!SrcAlloca) - return false; - ConstantInt *Len = dyn_cast(M->getLength()); - if (Len == nullptr) - return false; - if (performStackMoveOptzn(M, M, DestAlloca, SrcAlloca, Len->getZExtValue(), - BAA)) { - // Avoid invalidating the iterator. - BBI = M->getNextNonDebugInstruction()->getIterator(); - eraseInstruction(M); - ++NumMemCpyInstr; - return true; - } - return false; } diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp index 4f1350e4ebb9..2031e70bee1d 100644 --- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -675,6 +675,12 @@ bool TailRecursionEliminator::eliminateCall(CallInst *CI) { for (unsigned I = 0, E = CI->arg_size(); I != E; ++I) { if (CI->isByValArgument(I)) { copyLocalTempOfByValueOperandIntoArguments(CI, I); + // When eliminating a tail call, we modify the values of the arguments. + // Therefore, if the byval parameter has a readonly attribute, we have to + // remove it. It is safe because, from the perspective of a caller, the + // byval parameter is always treated as "readonly," even if the readonly + // attribute is removed. 
+ F.removeParamAttr(I, Attribute::ReadOnly); ArgumentPHIs[I]->addIncoming(F.getArg(I), BB); } else ArgumentPHIs[I]->addIncoming(CI->getArgOperand(I), BB); diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index 5b0951252c07..3ad97613fe7a 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -227,9 +227,21 @@ static Value *convertStrToInt(CallInst *CI, StringRef &Str, Value *EndPtr, return ConstantInt::get(RetTy, Result); } +static bool isOnlyUsedInComparisonWithZero(Value *V) { + for (User *U : V->users()) { + if (ICmpInst *IC = dyn_cast(U)) + if (Constant *C = dyn_cast(IC->getOperand(1))) + if (C->isNullValue()) + continue; + // Unknown instruction. + return false; + } + return true; +} + static bool canTransformToMemCmp(CallInst *CI, Value *Str, uint64_t Len, const DataLayout &DL) { - if (!isOnlyUsedInZeroComparison(CI)) + if (!isOnlyUsedInComparisonWithZero(CI)) return false; if (!isDereferenceableAndAlignedPointer(Str, Align(1), APInt(64, Len), DL)) diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp index aa924823e554..bc8e0413b339 100644 --- a/llvm/tools/llvm-readobj/ELFDumper.cpp +++ b/llvm/tools/llvm-readobj/ELFDumper.cpp @@ -3775,7 +3775,7 @@ void GNUELFDumper::printRelRelaReloc(const Relocation &R, if (!Fields[4].Str.empty()) { if (RelAddend < 0) { Addend = " - "; - RelAddend = std::abs(RelAddend); + RelAddend = -static_cast(RelAddend); } else { Addend = " + "; } From 8092e001bcd76c0b9fec2311f3a515aa60d2ed07 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Mon, 11 Sep 2023 15:44:52 +0200 Subject: [PATCH 3/3] Vendor import of llvm-project branch release/17.x llvmorg-17.0.0-rc4-10-g0176e8729ea4. 
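Before the next patch begins, one note on the llvm-readobj change at the end of the hunk above: taking std::abs of a most-negative signed addend is undefined behaviour, whereas negating after conversion to an unsigned type is well defined for every input. A minimal sketch (the choice of uint64_t here is mine, matching the printed width, not quoted from the patch):

// Illustrative sketch, not part of the patch.
#include <cinttypes>
#include <cstdint>
#include <cstdio>

int main() {
  int64_t addend = INT64_MIN;
  // |INT64_MIN| is not representable as int64_t, so std::abs(addend) would be
  // undefined. Unsigned negation wraps modulo 2^64 and yields the magnitude.
  uint64_t magnitude = -static_cast<uint64_t>(addend);
  std::printf("- 0x%" PRIx64 "\n", magnitude);   // prints - 0x8000000000000000
}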
--- clang/include/clang/AST/ExprConcepts.h | 14 +++-- clang/lib/CodeGen/CodeGenModule.cpp | 2 +- clang/lib/Driver/Driver.cpp | 8 ++- clang/lib/Driver/ToolChains/AIX.cpp | 6 +++ clang/lib/Driver/ToolChains/Arch/X86.cpp | 8 +-- clang/lib/Driver/ToolChains/Arch/X86.h | 2 +- clang/lib/Driver/ToolChains/CommonArgs.cpp | 2 +- clang/lib/Frontend/FrontendAction.cpp | 5 ++ clang/lib/Sema/SemaExprCXX.cpp | 25 ++++++--- clang/lib/Sema/SemaTemplateInstantiate.cpp | 17 +++++- .../Inclusions/Stdlib/StdSymbolMap.inc | 54 +++++++++++++++++++ compiler-rt/lib/builtins/aarch64/lse.S | 40 ++++++++++++-- .../sanitizer_common_interceptors_format.inc | 16 ++++-- libcxx/include/__config | 36 +++++++++---- llvm/include/llvm/Analysis/LazyValueInfo.h | 3 ++ llvm/lib/Analysis/LazyValueInfo.cpp | 9 ++++ llvm/lib/Analysis/ScalarEvolution.cpp | 2 +- .../lib/CodeGen/ComplexDeinterleavingPass.cpp | 12 ++++- .../Target/AArch64/AArch64ISelLowering.cpp | 50 ++++++++++++++--- llvm/lib/Transforms/Scalar/JumpThreading.cpp | 2 + .../Transforms/Vectorize/LoopVectorize.cpp | 38 ++++++++++++- 21 files changed, 297 insertions(+), 54 deletions(-) diff --git a/clang/include/clang/AST/ExprConcepts.h b/clang/include/clang/AST/ExprConcepts.h index d900e980852b..13d4568119eb 100644 --- a/clang/include/clang/AST/ExprConcepts.h +++ b/clang/include/clang/AST/ExprConcepts.h @@ -14,20 +14,21 @@ #ifndef LLVM_CLANG_AST_EXPRCONCEPTS_H #define LLVM_CLANG_AST_EXPRCONCEPTS_H -#include "clang/AST/ASTContext.h" #include "clang/AST/ASTConcept.h" +#include "clang/AST/ASTContext.h" #include "clang/AST/Decl.h" -#include "clang/AST/DeclarationName.h" #include "clang/AST/DeclTemplate.h" +#include "clang/AST/DeclarationName.h" #include "clang/AST/Expr.h" #include "clang/AST/NestedNameSpecifier.h" #include "clang/AST/TemplateBase.h" #include "clang/AST/Type.h" #include "clang/Basic/SourceLocation.h" +#include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TrailingObjects.h" -#include #include +#include namespace clang { class ASTStmtReader; @@ -467,6 +468,13 @@ class NestedRequirement : public Requirement { } }; +using EntityPrinter = llvm::function_ref; + +/// \brief create a Requirement::SubstitutionDiagnostic with only a +/// SubstitutedEntity and DiagLoc using Sema's allocator. +Requirement::SubstitutionDiagnostic * +createSubstDiagAt(Sema &S, SourceLocation Location, EntityPrinter Printer); + } // namespace concepts /// C++2a [expr.prim.req]: diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index a3506df7d4e5..f09d1129b128 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -2386,7 +2386,7 @@ void CodeGenModule::SetLLVMFunctionAttributesForDefinition(const Decl *D, // functions. If the current target's C++ ABI requires this and this is a // member function, set its alignment accordingly. 
if (getTarget().getCXXABI().areMemberFunctionsAligned()) { - if (F->getPointerAlignment(getDataLayout()) < 2 && isa(D)) + if (isa(D) && F->getPointerAlignment(getDataLayout()) < 2) F->setAlignment(std::max(llvm::Align(2), F->getAlign().valueOrOne())); } diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index f6ea4d0b4366..bdbdad9362e1 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -4936,6 +4936,12 @@ void Driver::BuildJobs(Compilation &C) const { (void)C.getArgs().hasArg(options::OPT_driver_mode); (void)C.getArgs().hasArg(options::OPT_rsp_quoting); + bool HasAssembleJob = llvm::any_of(C.getJobs(), [](auto &J) { + // Match ClangAs and other derived assemblers of Tool. ClangAs uses a + // longer ShortName "clang integrated assembler" while other assemblers just + // use "assembler". + return strstr(J.getCreator().getShortName(), "assembler"); + }); for (Arg *A : C.getArgs()) { // FIXME: It would be nice to be able to send the argument to the // DiagnosticsEngine, so that extra values, position, and so on could be @@ -4965,7 +4971,7 @@ void Driver::BuildJobs(Compilation &C) const { // already been warned about. if (!IsCLMode() || !A->getOption().matches(options::OPT_UNKNOWN)) { if (A->getOption().hasFlag(options::TargetSpecific) && - !A->isIgnoredTargetSpecific()) { + !A->isIgnoredTargetSpecific() && !HasAssembleJob) { Diag(diag::err_drv_unsupported_opt_for_target) << A->getSpelling() << getTargetTriple(); } else { diff --git a/clang/lib/Driver/ToolChains/AIX.cpp b/clang/lib/Driver/ToolChains/AIX.cpp index 97217eba9ca0..bfc86d9f3471 100644 --- a/clang/lib/Driver/ToolChains/AIX.cpp +++ b/clang/lib/Driver/ToolChains/AIX.cpp @@ -30,6 +30,7 @@ void aix::Assembler::ConstructJob(Compilation &C, const JobAction &JA, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { + const Driver &D = getToolChain().getDriver(); ArgStringList CmdArgs; const bool IsArch32Bit = getToolChain().getTriple().isArch32Bit(); @@ -38,6 +39,11 @@ void aix::Assembler::ConstructJob(Compilation &C, const JobAction &JA, if (!IsArch32Bit && !IsArch64Bit) llvm_unreachable("Unsupported bit width value."); + if (Arg *A = C.getArgs().getLastArg(options::OPT_G)) { + D.Diag(diag::err_drv_unsupported_opt_for_target) + << A->getSpelling() << D.getTargetTriple(); + } + // Specify the mode in which the as(1) command operates. if (IsArch32Bit) { CmdArgs.push_back("-a32"); diff --git a/clang/lib/Driver/ToolChains/Arch/X86.cpp b/clang/lib/Driver/ToolChains/Arch/X86.cpp index 4383b8004143..cf2bc63d74ad 100644 --- a/clang/lib/Driver/ToolChains/Arch/X86.cpp +++ b/clang/lib/Driver/ToolChains/Arch/X86.cpp @@ -118,13 +118,7 @@ std::string x86::getX86TargetCPU(const Driver &D, const ArgList &Args, void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, const ArgList &Args, - std::vector &Features, bool ForAS) { - if (ForAS) { - // Some target-specific options are only handled in AddX86TargetArgs, which - // is not called by ClangAs::ConstructJob. Claim them here. - Args.claimAllArgs(options::OPT_mfpmath_EQ); - } - + std::vector &Features) { // Claim and report unsupported -mabi=. Note: we don't support "sysv_abi" or // "ms_abi" as default function attributes. 
if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mabi_EQ)) { diff --git a/clang/lib/Driver/ToolChains/Arch/X86.h b/clang/lib/Driver/ToolChains/Arch/X86.h index 762a1fa6f4d5..e07387f3ece3 100644 --- a/clang/lib/Driver/ToolChains/Arch/X86.h +++ b/clang/lib/Driver/ToolChains/Arch/X86.h @@ -26,7 +26,7 @@ std::string getX86TargetCPU(const Driver &D, const llvm::opt::ArgList &Args, void getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, const llvm::opt::ArgList &Args, - std::vector &Features, bool ForAS); + std::vector &Features); } // end namespace x86 } // end namespace target diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 8766d34eec53..0d6907b8e5c7 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -528,7 +528,7 @@ void tools::getTargetFeatures(const Driver &D, const llvm::Triple &Triple, break; case llvm::Triple::x86: case llvm::Triple::x86_64: - x86::getX86TargetFeatures(D, Triple, Args, Features, ForAS); + x86::getX86TargetFeatures(D, Triple, Args, Features); break; case llvm::Triple::hexagon: hexagon::getHexagonTargetFeatures(D, Triple, Args, Features); diff --git a/clang/lib/Frontend/FrontendAction.cpp b/clang/lib/Frontend/FrontendAction.cpp index c6f958a6077b..0bd4b01ff79d 100644 --- a/clang/lib/Frontend/FrontendAction.cpp +++ b/clang/lib/Frontend/FrontendAction.cpp @@ -15,6 +15,7 @@ #include "clang/Basic/FileEntry.h" #include "clang/Basic/LangStandard.h" #include "clang/Basic/Sarif.h" +#include "clang/Basic/Stack.h" #include "clang/Frontend/ASTUnit.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/FrontendDiagnostic.h" @@ -1150,6 +1151,10 @@ void ASTFrontendAction::ExecuteAction() { CompilerInstance &CI = getCompilerInstance(); if (!CI.hasPreprocessor()) return; + // This is a fallback: If the client forgets to invoke this, we mark the + // current stack as the bottom. Though not optimal, this could help prevent + // stack overflow during deep recursion. + clang::noteBottomOfStack(); // FIXME: Move the truncation aspect of this into Sema, we delayed this till // here so the source manager would be initialized. 
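The FrontendAction change above calls clang::noteBottomOfStack() as a fallback so later stack-depth checks have a reference point. The general technique, sketched below independently of clang's actual implementation, is to record the address of a local variable at a known entry point and compare it against the current frame to estimate consumption. The helper names and thresholds here are illustrative, not clang's.

// Illustrative sketch, not part of the patch.
#include <cstddef>
#include <cstdint>
#include <cstdio>

static thread_local uintptr_t BottomOfStack = 0;

static void noteBottomOfStackHere() {
  char marker;
  BottomOfStack = reinterpret_cast<uintptr_t>(&marker);
}

static std::size_t estimatedStackUsage() {
  char marker;
  uintptr_t here = reinterpret_cast<uintptr_t>(&marker);
  return here > BottomOfStack ? here - BottomOfStack : BottomOfStack - here;
}

static int deepRecursion(int depth) {
  char pad[1024];                       // make each frame visibly large
  pad[0] = static_cast<char>(depth);
  if (estimatedStackUsage() > 256 * 1024) {
    std::printf("bailing out at depth %d\n", depth);   // degrade gracefully
    return depth;
  }
  return deepRecursion(depth + 1) + pad[0] % 2;
}

int main() {
  noteBottomOfStackHere();
  return deepRecursion(0) > 0 ? 0 : 1;
}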
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index 423d5372a6f6..1cff4a75790e 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -19,6 +19,7 @@ #include "clang/AST/CharUnits.h" #include "clang/AST/DeclObjC.h" #include "clang/AST/ExprCXX.h" +#include "clang/AST/ExprConcepts.h" #include "clang/AST/ExprObjC.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/AST/Type.h" @@ -9072,16 +9073,24 @@ Sema::BuildExprRequirement( MultiLevelTemplateArgumentList MLTAL(Param, TAL.asArray(), /*Final=*/false); MLTAL.addOuterRetainedLevels(TPL->getDepth()); - Expr *IDC = Param->getTypeConstraint()->getImmediatelyDeclaredConstraint(); + const TypeConstraint *TC = Param->getTypeConstraint(); + assert(TC && "Type Constraint cannot be null here"); + auto *IDC = TC->getImmediatelyDeclaredConstraint(); + assert(IDC && "ImmediatelyDeclaredConstraint can't be null here."); ExprResult Constraint = SubstExpr(IDC, MLTAL); if (Constraint.isInvalid()) { - Status = concepts::ExprRequirement::SS_ExprSubstitutionFailure; - } else { - SubstitutedConstraintExpr = - cast(Constraint.get()); - if (!SubstitutedConstraintExpr->isSatisfied()) - Status = concepts::ExprRequirement::SS_ConstraintsNotSatisfied; - } + return new (Context) concepts::ExprRequirement( + concepts::createSubstDiagAt(*this, IDC->getExprLoc(), + [&](llvm::raw_ostream &OS) { + IDC->printPretty(OS, /*Helper=*/nullptr, + getPrintingPolicy()); + }), + IsSimple, NoexceptLoc, ReturnTypeRequirement); + } + SubstitutedConstraintExpr = + cast(Constraint.get()); + if (!SubstitutedConstraintExpr->isSatisfied()) + Status = concepts::ExprRequirement::SS_ConstraintsNotSatisfied; } return new (Context) concepts::ExprRequirement(E, IsSimple, NoexceptLoc, ReturnTypeRequirement, Status, diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index 8702e2ca3a1b..394006a57747 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -2276,9 +2276,9 @@ QualType TemplateInstantiator::TransformSubstTemplateTypeParmPackType( getPackIndex(Pack), Arg, TL.getNameLoc()); } -template static concepts::Requirement::SubstitutionDiagnostic * -createSubstDiag(Sema &S, TemplateDeductionInfo &Info, EntityPrinter Printer) { +createSubstDiag(Sema &S, TemplateDeductionInfo &Info, + concepts::EntityPrinter Printer) { SmallString<128> Message; SourceLocation ErrorLoc; if (Info.hasSFINAEDiagnostic()) { @@ -2302,6 +2302,19 @@ createSubstDiag(Sema &S, TemplateDeductionInfo &Info, EntityPrinter Printer) { StringRef(MessageBuf, Message.size())}; } +concepts::Requirement::SubstitutionDiagnostic * +concepts::createSubstDiagAt(Sema &S, SourceLocation Location, + EntityPrinter Printer) { + SmallString<128> Entity; + llvm::raw_svector_ostream OS(Entity); + Printer(OS); + char *EntityBuf = new (S.Context) char[Entity.size()]; + llvm::copy(Entity, EntityBuf); + return new (S.Context) concepts::Requirement::SubstitutionDiagnostic{ + /*SubstitutedEntity=*/StringRef(EntityBuf, Entity.size()), + /*DiagLoc=*/Location, /*DiagMessage=*/StringRef()}; +} + ExprResult TemplateInstantiator::TransformRequiresTypeParams( SourceLocation KWLoc, SourceLocation RBraceLoc, const RequiresExpr *RE, RequiresExprBodyDecl *Body, ArrayRef Params, diff --git a/clang/lib/Tooling/Inclusions/Stdlib/StdSymbolMap.inc b/clang/lib/Tooling/Inclusions/Stdlib/StdSymbolMap.inc index a08ec11e77a4..b46bd2e4d7a4 100644 --- a/clang/lib/Tooling/Inclusions/Stdlib/StdSymbolMap.inc +++ 
b/clang/lib/Tooling/Inclusions/Stdlib/StdSymbolMap.inc @@ -3773,6 +3773,33 @@ SYMBOL(viewable_range, std::ranges::, ) SYMBOL(wistream_view, std::ranges::, ) SYMBOL(zip_transform_view, std::ranges::, ) SYMBOL(zip_view, std::ranges::, ) +SYMBOL(all, std::ranges::views::, ) +SYMBOL(all_t, std::ranges::views::, ) +SYMBOL(as_const, std::ranges::views::, ) +SYMBOL(as_rvalue, std::ranges::views::, ) +SYMBOL(common, std::ranges::views::, ) +SYMBOL(counted, std::ranges::views::, ) +SYMBOL(drop, std::ranges::views::, ) +SYMBOL(drop_while, std::ranges::views::, ) +SYMBOL(elements, std::ranges::views::, ) +SYMBOL(empty, std::ranges::views::, ) +SYMBOL(filter, std::ranges::views::, ) +SYMBOL(iota, std::ranges::views::, ) +SYMBOL(istream, std::ranges::views::, ) +SYMBOL(istream, std::ranges::views::, ) +SYMBOL(join, std::ranges::views::, ) +SYMBOL(join_with, std::ranges::views::, ) +SYMBOL(keys, std::ranges::views::, ) +SYMBOL(lazy_split, std::ranges::views::, ) +SYMBOL(reverse, std::ranges::views::, ) +SYMBOL(single, std::ranges::views::, ) +SYMBOL(split, std::ranges::views::, ) +SYMBOL(take, std::ranges::views::, ) +SYMBOL(take_while, std::ranges::views::, ) +SYMBOL(transform, std::ranges::views::, ) +SYMBOL(values, std::ranges::views::, ) +SYMBOL(zip, std::ranges::views::, ) +SYMBOL(zip_transform, std::ranges::views::, ) SYMBOL(ECMAScript, std::regex_constants::, ) SYMBOL(awk, std::regex_constants::, ) SYMBOL(basic, std::regex_constants::, ) @@ -3817,3 +3844,30 @@ SYMBOL(get_id, std::this_thread::, ) SYMBOL(sleep_for, std::this_thread::, ) SYMBOL(sleep_until, std::this_thread::, ) SYMBOL(yield, std::this_thread::, ) +SYMBOL(all, std::views::, ) +SYMBOL(all_t, std::views::, ) +SYMBOL(as_const, std::views::, ) +SYMBOL(as_rvalue, std::views::, ) +SYMBOL(common, std::views::, ) +SYMBOL(counted, std::views::, ) +SYMBOL(drop, std::views::, ) +SYMBOL(drop_while, std::views::, ) +SYMBOL(elements, std::views::, ) +SYMBOL(empty, std::views::, ) +SYMBOL(filter, std::views::, ) +SYMBOL(iota, std::views::, ) +SYMBOL(istream, std::views::, ) +SYMBOL(istream, std::views::, ) +SYMBOL(join, std::views::, ) +SYMBOL(join_with, std::views::, ) +SYMBOL(keys, std::views::, ) +SYMBOL(lazy_split, std::views::, ) +SYMBOL(reverse, std::views::, ) +SYMBOL(single, std::views::, ) +SYMBOL(split, std::views::, ) +SYMBOL(take, std::views::, ) +SYMBOL(take_while, std::views::, ) +SYMBOL(transform, std::views::, ) +SYMBOL(values, std::views::, ) +SYMBOL(zip, std::views::, ) +SYMBOL(zip_transform, std::views::, ) diff --git a/compiler-rt/lib/builtins/aarch64/lse.S b/compiler-rt/lib/builtins/aarch64/lse.S index 5dc0d5320b5a..1fe18f4a4681 100644 --- a/compiler-rt/lib/builtins/aarch64/lse.S +++ b/compiler-rt/lib/builtins/aarch64/lse.S @@ -7,7 +7,7 @@ // Out-of-line LSE atomics helpers. Ported from libgcc library. 
// N = {1, 2, 4, 8} // M = {1, 2, 4, 8, 16} -// ORDER = {'relax', 'acq', 'rel', 'acq_rel'} +// ORDER = {'relax', 'acq', 'rel', 'acq_rel', 'sync'} // Routines implemented: // // iM __aarch64_casM_ORDER(iM expected, iM desired, iM *ptr) @@ -35,8 +35,8 @@ HIDDEN(___aarch64_have_lse_atomics) #endif // Generate mnemonics for -// L_cas: SIZE: 1,2,4,8,16 MODEL: 1,2,3,4 -// L_swp L_ldadd L_ldclr L_ldeor L_ldset: SIZE: 1,2,4,8 MODEL: 1,2,3,4 +// L_cas: SIZE: 1,2,4,8,16 MODEL: 1,2,3,4,5 +// L_swp L_ldadd L_ldclr L_ldeor L_ldset: SIZE: 1,2,4,8 MODEL: 1,2,3,4,5 #if SIZE == 1 #define S b @@ -64,24 +64,44 @@ HIDDEN(___aarch64_have_lse_atomics) #define L #define M 0x000000 #define N 0x000000 +#define BARRIER #elif MODEL == 2 #define SUFF _acq #define A a #define L #define M 0x400000 #define N 0x800000 +#define BARRIER #elif MODEL == 3 #define SUFF _rel #define A #define L l #define M 0x008000 #define N 0x400000 +#define BARRIER #elif MODEL == 4 #define SUFF _acq_rel #define A a #define L l #define M 0x408000 #define N 0xc00000 +#define BARRIER +#elif MODEL == 5 +#define SUFF _sync +#ifdef L_swp +// swp has _acq semantics. +#define A a +#define L +#define M 0x400000 +#define N 0x800000 +#else +// All other _sync functions have _seq semantics. +#define A a +#define L l +#define M 0x408000 +#define N 0xc00000 +#endif +#define BARRIER dmb ish #else #error #endif // MODEL @@ -96,7 +116,12 @@ HIDDEN(___aarch64_have_lse_atomics) #endif #define NAME(BASE) GLUE4(__aarch64_, BASE, SIZE, SUFF) +#if MODEL == 5 +// Drop A for _sync functions. +#define LDXR GLUE3(ld, xr, S) +#else #define LDXR GLUE4(ld, A, xr, S) +#endif #define STXR GLUE4(st, L, xr, S) // Define temporary registers. @@ -136,9 +161,15 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(NAME(cas)) STXR w(tmp1), s(1), [x2] cbnz w(tmp1), 0b 1: + BARRIER ret #else +#if MODEL == 5 +// Drop A for _sync functions. +#define LDXP GLUE2(ld, xp) +#else #define LDXP GLUE3(ld, A, xp) +#endif #define STXP GLUE3(st, L, xp) #ifdef HAS_ASM_LSE #define CASP GLUE3(casp, A, L) x0, x1, x2, x3, [x4] @@ -159,6 +190,7 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(NAME(cas)) STXP w(tmp2), x2, x3, [x4] cbnz w(tmp2), 0b 1: + BARRIER ret #endif END_COMPILERRT_OUTLINE_FUNCTION(NAME(cas)) @@ -180,6 +212,7 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(NAME(swp)) LDXR s(0), [x1] STXR w(tmp1), s(tmp0), [x1] cbnz w(tmp1), 0b + BARRIER ret END_COMPILERRT_OUTLINE_FUNCTION(NAME(swp)) #endif // L_swp @@ -224,6 +257,7 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(NAME(LDNM)) OP s(tmp1), s(0), s(tmp0) STXR w(tmp2), s(tmp1), [x1] cbnz w(tmp2), 0b + BARRIER ret END_COMPILERRT_OUTLINE_FUNCTION(NAME(LDNM)) #endif // L_ldadd L_ldclr L_ldeor L_ldset diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_format.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_format.inc index 220abb89c3be..24485900644b 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_format.inc +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_format.inc @@ -340,11 +340,19 @@ static void scanf_common(void *ctx, int n_inputs, bool allowGnuMalloc, size = 0; } COMMON_INTERCEPTOR_WRITE_RANGE(ctx, argp, size); - // For %ms/%mc, write the allocated output buffer as well. + // For %mc/%mC/%ms/%m[/%mS, write the allocated output buffer as well. 
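On the lse.S hunk above: the new _sync helper variants end in a dmb ish because they serve the legacy __sync_* builtins, which promise full-barrier semantics, while the __atomic_* builtins carry an explicit memory order and use the relax/acq/rel/acq_rel variants. The mapping to specific outline helpers below is descriptive rather than verified against the driver; a minimal sketch assuming an AArch64 build with -moutline-atomics:

// Illustrative sketch, not part of the patch.
#include <cstdint>

uint64_t legacy_fetch_add(uint64_t *p, uint64_t v) {
  return __sync_fetch_and_add(p, v);                   // full-barrier semantics
}

uint64_t relaxed_fetch_add(uint64_t *p, uint64_t v) {
  return __atomic_fetch_add(p, v, __ATOMIC_RELAXED);   // no ordering required
}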
if (dir.allocate) { - char *buf = *(char **)argp; - if (buf) - COMMON_INTERCEPTOR_WRITE_RANGE(ctx, buf, internal_strlen(buf) + 1); + if (char *buf = *(char **)argp) { + if (dir.convSpecifier == 'c') + size = 1; + else if (dir.convSpecifier == 'C') + size = sizeof(wchar_t); + else if (dir.convSpecifier == 'S') + size = (internal_wcslen((wchar_t *)buf) + 1) * sizeof(wchar_t); + else // 's' or '[' + size = internal_strlen(buf) + 1; + COMMON_INTERCEPTOR_WRITE_RANGE(ctx, buf, size); + } } } } diff --git a/libcxx/include/__config b/libcxx/include/__config index 9759d3b9e8e0..43f8a20031ff 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -208,19 +208,16 @@ // HARDENING { -// TODO(hardening): remove this in LLVM 18. -// This is for backward compatibility -- make enabling `_LIBCPP_ENABLE_ASSERTIONS` (which predates hardening modes) -// equivalent to setting the hardened mode. -# ifdef _LIBCPP_ENABLE_ASSERTIONS -# warning "_LIBCPP_ENABLE_ASSERTIONS is deprecated, please use _LIBCPP_ENABLE_HARDENED_MODE instead." -# if _LIBCPP_ENABLE_ASSERTIONS != 0 && _LIBCPP_ENABLE_ASSERTIONS != 1 -# error "_LIBCPP_ENABLE_ASSERTIONS must be set to 0 or 1" -# endif -# if _LIBCPP_ENABLE_ASSERTIONS -# define _LIBCPP_ENABLE_HARDENED_MODE 1 -# endif +# ifndef _LIBCPP_ENABLE_ASSERTIONS +# define _LIBCPP_ENABLE_ASSERTIONS _LIBCPP_ENABLE_ASSERTIONS_DEFAULT +# endif +# if _LIBCPP_ENABLE_ASSERTIONS != 0 && _LIBCPP_ENABLE_ASSERTIONS != 1 +# error "_LIBCPP_ENABLE_ASSERTIONS must be set to 0 or 1" # endif +// NOTE: These modes are experimental and are not stable yet in LLVM 17. Please refrain from using them and use the +// documented libc++ "safe" mode instead. +// // Enables the hardened mode which consists of all checks intended to be used in production. Hardened mode prioritizes // security-critical checks that can be done with relatively little overhead in constant time. Mutually exclusive with // `_LIBCPP_ENABLE_DEBUG_MODE`. @@ -275,6 +272,11 @@ # error "Only one of _LIBCPP_ENABLE_HARDENED_MODE and _LIBCPP_ENABLE_DEBUG_MODE can be enabled." # endif +# if _LIBCPP_ENABLE_ASSERTIONS && (_LIBCPP_ENABLE_HARDENED_MODE || _LIBCPP_ENABLE_DEBUG_MODE) +# error \ + "_LIBCPP_ENABLE_ASSERTIONS is mutually exclusive with _LIBCPP_ENABLE_HARDENED_MODE and _LIBCPP_ENABLE_DEBUG_MODE." +# endif + // Hardened mode checks. // clang-format off @@ -303,6 +305,18 @@ # define _LIBCPP_ASSERT_INTERNAL(expression, message) _LIBCPP_ASSERT(expression, message) # define _LIBCPP_ASSERT_UNCATEGORIZED(expression, message) _LIBCPP_ASSERT(expression, message) +// Safe mode checks. + +# elif _LIBCPP_ENABLE_ASSERTIONS + +// All checks enabled. +# define _LIBCPP_ASSERT_VALID_INPUT_RANGE(expression, message) _LIBCPP_ASSERT(expression, message) +# define _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(expression, message) _LIBCPP_ASSERT(expression, message) +# define _LIBCPP_ASSERT_NON_OVERLAPPING_RANGES(expression, message) _LIBCPP_ASSERT(expression, message) +# define _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR(expression, message) _LIBCPP_ASSERT(expression, message) +# define _LIBCPP_ASSERT_INTERNAL(expression, message) _LIBCPP_ASSERT(expression, message) +# define _LIBCPP_ASSERT_UNCATEGORIZED(expression, message) _LIBCPP_ASSERT(expression, message) + // Disable all checks if hardening is not enabled. 
# else diff --git a/llvm/include/llvm/Analysis/LazyValueInfo.h b/llvm/include/llvm/Analysis/LazyValueInfo.h index b109b7f7e65a..7b2bfdac75a8 100644 --- a/llvm/include/llvm/Analysis/LazyValueInfo.h +++ b/llvm/include/llvm/Analysis/LazyValueInfo.h @@ -115,6 +115,9 @@ class LazyValueInfo { /// PredBB to OldSucc to be from PredBB to NewSucc instead. void threadEdge(BasicBlock *PredBB, BasicBlock *OldSucc, BasicBlock *NewSucc); + /// Remove information related to this value from the cache. + void forgetValue(Value *V); + /// Inform the analysis cache that we have erased a block. void eraseBlock(BasicBlock *BB); diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index 33651783cb17..2ba6036056d9 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -465,6 +465,10 @@ class LazyValueInfoImpl { F.print(OS, &Writer); } + /// This is part of the update interface to remove information related to this + /// value from the cache. + void forgetValue(Value *V) { TheCache.eraseValue(V); } + /// This is part of the update interface to inform the cache /// that a block has been deleted. void eraseBlock(BasicBlock *BB) { @@ -1969,6 +1973,11 @@ void LazyValueInfo::threadEdge(BasicBlock *PredBB, BasicBlock *OldSucc, } } +void LazyValueInfo::forgetValue(Value *V) { + if (PImpl) + getImpl(PImpl, AC, nullptr).forgetValue(V); +} + void LazyValueInfo::eraseBlock(BasicBlock *BB) { if (PImpl) { getImpl(PImpl, AC, BB->getModule()).eraseBlock(BB); diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 111d4d30aab9..39ab48b4a48e 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -6833,7 +6833,7 @@ const ConstantRange &ScalarEvolution::getRangeRef( if (llvm::isKnownNonZero(V, DL)) MinVal = Align; ConservativeResult = ConservativeResult.intersectWith( - {MinVal, MaxVal + 1}, RangeType); + ConstantRange::getNonEmpty(MinVal, MaxVal + 1), RangeType); } } diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp index 952f454f8f6a..7979ac9a5fb7 100644 --- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp +++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp @@ -1424,7 +1424,17 @@ bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) { // CompositeNode we should choose only one either Real or Imag instruction to // use as an anchor for generating complex instruction. auto It = RootToNode.find(RootI); - if (It != RootToNode.end() && It->second->Real == RootI) { + if (It != RootToNode.end()) { + auto RootNode = It->second; + assert(RootNode->Operation == + ComplexDeinterleavingOperation::ReductionOperation); + // Find out which part, Real or Imag, comes later, and only if we come to + // the latest part, add it to OrderedRoots. + auto *R = cast(RootNode->Real); + auto *I = cast(RootNode->Imag); + auto *ReplacementAnchor = R->comesBefore(I) ? 
+    if (ReplacementAnchor != RootI)
+      return false;
     OrderedRoots.push_back(RootI);
     return true;
   }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 0605dfa63793..c7a6dd7deb45 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13840,7 +13840,17 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   case Intrinsic::aarch64_neon_ld4:
   case Intrinsic::aarch64_neon_ld1x2:
   case Intrinsic::aarch64_neon_ld1x3:
-  case Intrinsic::aarch64_neon_ld1x4:
+  case Intrinsic::aarch64_neon_ld1x4: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+    Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
+    Info.offset = 0;
+    Info.align.reset();
+    // volatile loads with NEON intrinsics not supported
+    Info.flags = MachineMemOperand::MOLoad;
+    return true;
+  }
   case Intrinsic::aarch64_neon_ld2lane:
   case Intrinsic::aarch64_neon_ld3lane:
   case Intrinsic::aarch64_neon_ld4lane:
@@ -13848,9 +13858,13 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   case Intrinsic::aarch64_neon_ld3r:
   case Intrinsic::aarch64_neon_ld4r: {
     Info.opc = ISD::INTRINSIC_W_CHAIN;
-    // Conservatively set memVT to the entire set of vectors loaded.
-    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
-    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+    // ldX intrinsics return a struct whose fields all have the same vector type.
+    Type *RetTy = I.getType();
+    auto *StructTy = cast<StructType>(RetTy);
+    unsigned NumElts = StructTy->getNumElements();
+    Type *VecTy = StructTy->getElementType(0);
+    MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
     Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
     Info.offset = 0;
     Info.align.reset();
@@ -13863,20 +13877,40 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   case Intrinsic::aarch64_neon_st4:
   case Intrinsic::aarch64_neon_st1x2:
   case Intrinsic::aarch64_neon_st1x3:
-  case Intrinsic::aarch64_neon_st1x4:
+  case Intrinsic::aarch64_neon_st1x4: {
+    Info.opc = ISD::INTRINSIC_VOID;
+    unsigned NumElts = 0;
+    for (const Value *Arg : I.args()) {
+      Type *ArgTy = Arg->getType();
+      if (!ArgTy->isVectorTy())
+        break;
+      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
+    }
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+    Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
+    Info.offset = 0;
+    Info.align.reset();
+    // volatile stores with NEON intrinsics not supported
+    Info.flags = MachineMemOperand::MOStore;
+    return true;
+  }
   case Intrinsic::aarch64_neon_st2lane:
   case Intrinsic::aarch64_neon_st3lane:
   case Intrinsic::aarch64_neon_st4lane: {
     Info.opc = ISD::INTRINSIC_VOID;
-    // Conservatively set memVT to the entire set of vectors stored.
     unsigned NumElts = 0;
+    // All vector arguments have the same type.
+    Type *VecTy = I.getArgOperand(0)->getType();
+    MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
+
     for (const Value *Arg : I.args()) {
       Type *ArgTy = Arg->getType();
       if (!ArgTy->isVectorTy())
         break;
-      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
+      NumElts += 1;
     }
-    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
     Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
     Info.offset = 0;
     Info.align.reset();
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 24390f1b54f6..5b8f1b00dc03 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -1269,6 +1269,7 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) {
     if (IsLoadCSE) {
       LoadInst *NLoadI = cast<LoadInst>(AvailableVal);
       combineMetadataForCSE(NLoadI, LoadI, false);
+      LVI->forgetValue(NLoadI);
     };
 
   // If the returned value is the load itself, replace with poison. This can
@@ -1461,6 +1462,7 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) {
 
   for (LoadInst *PredLoadI : CSELoads) {
     combineMetadataForCSE(PredLoadI, LoadI, true);
+    LVI->forgetValue(PredLoadI);
   }
 
   LoadI->replaceAllUsesWith(PN);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d7e40e8ef978..b603bbe55dc9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3781,10 +3781,44 @@ void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
   // the incoming edges.
   VPBasicBlock *Header =
       State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
+
+  // Gather all VPReductionPHIRecipes and sort them so that intermediate stores
+  // sunk outside of the loop keep the same order as they had in the original
+  // loop.
+  SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
   for (VPRecipeBase &R : Header->phis()) {
     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
-      fixReduction(ReductionPhi, State);
-    else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
+      ReductionPHIList.emplace_back(ReductionPhi);
+  }
+  stable_sort(ReductionPHIList, [this](const VPReductionPHIRecipe *R1,
                                        const VPReductionPHIRecipe *R2) {
+    auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
+    auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
+
+    // If neither of the recipes has an intermediate store, keep the order the
+    // same.
+    if (!IS1 && !IS2)
+      return false;
+
+    // If only one of the recipes has an intermediate store, then move it
+    // towards the beginning of the list.
+    if (IS1 && !IS2)
+      return true;
+
+    if (!IS1 && IS2)
+      return false;
+
+    // If both recipes have an intermediate store, then the recipe with the
+    // later store should be processed earlier. So it should go to the
+    // beginning of the list.
+    return DT->dominates(IS2, IS1);
+  });
+
+  for (VPReductionPHIRecipe *ReductionPhi : ReductionPHIList)
+    fixReduction(ReductionPhi, State);
+
+  for (VPRecipeBase &R : Header->phis()) {
+    if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
       fixFixedOrderRecurrence(FOR, State);
   }
 }
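
Note (appended for reference, not part of the diff above): the stable_sort added to InnerLoopVectorizer::fixCrossIterationPHIs only encodes an ordering rule for reduction PHIs with intermediate stores. The following is a minimal, hypothetical C++ sketch of that rule in isolation; the names are invented for illustration, and a plain integer "store position" stands in for the DominatorTree query DT->dominates(IS2, IS1).

// Hypothetical illustration only -- not LLVM code. An int position models
// where the intermediate store sits; a larger position means a later store.
#include <algorithm>
#include <cstdio>
#include <optional>
#include <vector>

struct Reduction {
  const char *Name;
  std::optional<int> StorePos; // empty: no intermediate store
};

int main() {
  std::vector<Reduction> Phis = {{"sum", std::nullopt}, {"r1", 10}, {"r2", 20}};

  std::stable_sort(Phis.begin(), Phis.end(),
                   [](const Reduction &R1, const Reduction &R2) {
                     // Neither has an intermediate store: keep the original order.
                     if (!R1.StorePos && !R2.StorePos)
                       return false;
                     // Only one has a store: move it towards the beginning.
                     if (R1.StorePos && !R2.StorePos)
                       return true;
                     if (!R1.StorePos && R2.StorePos)
                       return false;
                     // Both have stores: the later store is processed earlier,
                     // mirroring DT->dominates(IS2, IS1) in the patch.
                     return *R2.StorePos < *R1.StorePos;
                   });

  for (const Reduction &R : Phis)
    std::printf("%s\n", R.Name); // prints: r2, r1, sum
  return 0;
}

Compiled standalone, this prints r2, r1, sum: the reduction whose (hypothetical) intermediate store comes last is fixed first, and reductions without stores keep their relative order.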