diff --git a/.github/workflows/libclang-abi-tests.yml b/.github/workflows/libclang-abi-tests.yml index 41896d4392885..155a1fcda7b3a 100644 --- a/.github/workflows/libclang-abi-tests.yml +++ b/.github/workflows/libclang-abi-tests.yml @@ -146,10 +146,12 @@ jobs: uses: actions/download-artifact@v3 with: name: build-baseline + path: build-baseline - name: Download latest uses: actions/download-artifact@v3 with: name: build-latest + path: build-latest - name: Install abi-compliance-checker run: sudo apt-get install abi-compliance-checker diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 424b10c62b256..fe8c134b8554e 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -1844,8 +1844,9 @@ void RewriteInstance::adjustCommandLineOptions() { exit(1); } - if (opts::ReorderFunctions != ReorderFunctions::RT_NONE && - !opts::HotText.getNumOccurrences()) { + if (opts::Instrument || + (opts::ReorderFunctions != ReorderFunctions::RT_NONE && + !opts::HotText.getNumOccurrences())) { opts::HotText = true; } else if (opts::HotText && !BC->HasRelocations) { errs() << "BOLT-WARNING: hot text is disabled in non-relocation mode\n"; diff --git a/bolt/runtime/common.h b/bolt/runtime/common.h index 2ec6f45fc1595..9e6f1756c5707 100644 --- a/bolt/runtime/common.h +++ b/bolt/runtime/common.h @@ -165,6 +165,20 @@ int memcmp(const void *s1, const void *s2, size_t n) { // Anonymous namespace covering everything but our library entry point namespace { +// Get the difference between runtime address of .text section and +// static address in section header table. Can be extracted from arbitrary +// pc value recorded at runtime to get the corresponding static address, which +// in turn can be used to search for indirect call description. Needed because +// indirect call descriptions are read-only non-relocatable data. 
+uint64_t getTextBaseAddress() { + uint64_t DynAddr; + uint64_t StaticAddr; + __asm__ volatile("leaq __hot_end(%%rip), %0\n\t" + "movabsq $__hot_end, %1\n\t" + : "=r"(DynAddr), "=r"(StaticAddr)); + return DynAddr - StaticAddr; +} + constexpr uint32_t BufSize = 10240; #define _STRINGIFY(x) #x @@ -458,6 +472,16 @@ uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) { return ret; } +int __ftruncate(uint64_t fd, uint64_t length) { + int ret; + __asm__ __volatile__("movq $77, %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(fd), "S"(length) + : "cc", "rcx", "r11", "memory"); + return ret; +} + int __close(uint64_t fd) { uint64_t ret; __asm__ __volatile__("movq $3, %%rax\n" diff --git a/bolt/runtime/instr.cpp b/bolt/runtime/instr.cpp index ef55ded40431c..96a43f685befa 100644 --- a/bolt/runtime/instr.cpp +++ b/bolt/runtime/instr.cpp @@ -215,6 +215,12 @@ class BumpPtrAllocator { /// __bolt_instr_setup, our initialization routine. BumpPtrAllocator *GlobalAlloc; +// Base address which we subtract from recorded PC values when searching for +// indirect call description entries. Needed because indCall descriptions are +// mapped read-only and contain static addresses. Initialized in +// __bolt_instr_setup. +uint64_t TextBaseAddress = 0; + // Storage for GlobalAlloc which can be shared if not using // instrumentation-file-append-pid. void *GlobalMetadataStorage; @@ -1389,7 +1395,7 @@ void visitIndCallCounter(IndirectCallHashTable::MapEntry &Entry, const IndCallDescription *CallsiteDesc = &Ctx->IndCallDescriptions[CallsiteID]; const IndCallTargetDescription *TargetDesc = - Ctx->lookupIndCallTarget(Entry.Key); + Ctx->lookupIndCallTarget(Entry.Key - TextBaseAddress); if (!TargetDesc) { DEBUG(report("Failed to lookup indirect call target\n")); char LineBuf[BufSize]; @@ -1515,11 +1521,15 @@ extern "C" void __bolt_instr_clear_counters() { /// on demand. 
/// extern "C" void __attribute((force_align_arg_pointer)) -__bolt_instr_data_dump() { +__bolt_instr_data_dump(int FD) { // Already dumping if (!GlobalWriteProfileMutex->acquire()) return; + int ret = __lseek(FD, 0, SEEK_SET); + assert(ret == 0, "Failed to lseek!"); + ret = __ftruncate(FD, 0); + assert(ret == 0, "Failed to ftruncate!"); BumpPtrAllocator HashAlloc; HashAlloc.setMaxSize(0x6400000); ProfileWriterContext Ctx = readDescriptions(); @@ -1527,8 +1537,6 @@ __bolt_instr_data_dump() { DEBUG(printStats(Ctx)); - int FD = openProfile(); - BumpPtrAllocator Alloc; Alloc.setMaxSize(0x6400000); const uint8_t *FuncDesc = Ctx.FuncDescriptions; @@ -1544,7 +1552,6 @@ __bolt_instr_data_dump() { Ctx.CallFlowTable->forEachElement(visitCallFlowEntry, FD, &Ctx); __fsync(FD); - __close(FD); __munmap(Ctx.MMapPtr, Ctx.MMapSize); __close(Ctx.FileDesc); HashAlloc.destroy(); @@ -1557,6 +1564,7 @@ __bolt_instr_data_dump() { void watchProcess() { timespec ts, rem; uint64_t Ellapsed = 0ull; + int FD = openProfile(); uint64_t ppid; if (__bolt_instr_wait_forks) { // Store parent pgid @@ -1568,7 +1576,7 @@ void watchProcess() { ppid = __getppid(); if (ppid == 1) { // Parent already dead - __bolt_instr_data_dump(); + __bolt_instr_data_dump(FD); goto out; } } @@ -1581,7 +1589,7 @@ void watchProcess() { // so no need for us to keep dumping. 
if (__kill(ppid, 0) < 0) { if (__bolt_instr_no_counters_clear) - __bolt_instr_data_dump(); + __bolt_instr_data_dump(FD); break; } @@ -1589,13 +1597,14 @@ void watchProcess() { continue; Ellapsed = 0; - __bolt_instr_data_dump(); + __bolt_instr_data_dump(FD); if (__bolt_instr_no_counters_clear == false) __bolt_instr_clear_counters(); } out:; DEBUG(report("My parent process is dead, bye!\n")); + __close(FD); __exit(0); } @@ -1606,6 +1615,7 @@ extern "C" void __bolt_instr_indirect_tailcall(); extern "C" void __attribute((force_align_arg_pointer)) __bolt_instr_setup() { __bolt_ind_call_counter_func_pointer = __bolt_instr_indirect_call; __bolt_ind_tailcall_counter_func_pointer = __bolt_instr_indirect_tailcall; + TextBaseAddress = getTextBaseAddress(); const uint64_t CountersStart = reinterpret_cast(&__bolt_instr_locations[0]); @@ -1691,8 +1701,11 @@ extern "C" __attribute((naked)) void __bolt_instr_start() /// This is hooking into ELF's DT_FINI extern "C" void __bolt_instr_fini() { __bolt_fini_trampoline(); - if (__bolt_instr_sleep_time == 0) - __bolt_instr_data_dump(); + if (__bolt_instr_sleep_time == 0) { + int FD = openProfile(); + __bolt_instr_data_dump(FD); + __close(FD); + } DEBUG(report("Finished.\n")); } diff --git a/bolt/test/lit.cfg.py b/bolt/test/lit.cfg.py index fe27af87f9106..3a6da210e01f0 100644 --- a/bolt/test/lit.cfg.py +++ b/bolt/test/lit.cfg.py @@ -72,6 +72,9 @@ if config.gnu_ld: config.available_features.add("gnu_ld") +if lit.util.which("fuser"): + config.available_features.add("fuser") + llvm_config.use_default_substitutions() llvm_config.config.environment["CLANG"] = config.bolt_clang diff --git a/bolt/test/runtime/instrumentation-indirect-2.c b/bolt/test/runtime/instrumentation-indirect-2.c new file mode 100644 index 0000000000000..7d19db14b77f0 --- /dev/null +++ b/bolt/test/runtime/instrumentation-indirect-2.c @@ -0,0 +1,168 @@ +// Check that indirect call hash tables properly register multiple calls, +// and that calls from different processes 
don't get mixed up when using +// --instrumentation-file-append-pid. + +#include +#include +#include + +__attribute__((noinline)) void funcA(int pid) { printf("funcA %d\n", pid); } +__attribute__((noinline)) void funcB(int pid) { printf("funcB %d\n", pid); } +__attribute__((noinline)) void funcC(int pid) { printf("funcC %d\n", pid); } +__attribute__((noinline)) void funcD(int pid) { printf("funcD %d\n", pid); } +__attribute__((noinline)) void funcE(int pid) { printf("funcE %d\n", pid); } +__attribute__((noinline)) void funcF(int pid) { printf("funcF %d\n", pid); } +__attribute__((noinline)) void funcG(int pid) { printf("funcG %d\n", pid); } +__attribute__((noinline)) void funcH(int pid) { printf("funcH %d\n", pid); } +__attribute__((noinline)) void funcI(int pid) { printf("funcI %d\n", pid); } +__attribute__((noinline)) void funcJ(int pid) { printf("funcJ %d\n", pid); } +__attribute__((noinline)) void funcK(int pid) { printf("funcK %d\n", pid); } +__attribute__((noinline)) void funcL(int pid) { printf("funcL %d\n", pid); } +__attribute__((noinline)) void funcM(int pid) { printf("funcM %d\n", pid); } +__attribute__((noinline)) void funcN(int pid) { printf("funcN %d\n", pid); } +__attribute__((noinline)) void funcO(int pid) { printf("funcO %d\n", pid); } +__attribute__((noinline)) void funcP(int pid) { printf("funcP %d\n", pid); } + +int main() { + + void (*funcs[])(int) = {funcA, funcB, funcC, funcD, funcE, funcF, + funcG, funcH, funcI, funcJ, funcK, funcL, + funcM, funcN, funcO, funcP}; + int i; + + switch (fork()) { + case -1: + printf("Failed to fork!\n"); + exit(-1); + break; + case 0: + i = 0; + break; + default: + i = 1; + break; + } + int pid = getpid(); + for (; i < sizeof(funcs) / sizeof(void *); i += 2) { + funcs[i](pid); + } + + return 0; +} +/* +REQUIRES: system-linux,shell,fuser + +RUN: %clang %cflags %s -o %t.exe -Wl,-q -pie -fpie + +RUN: llvm-bolt %t.exe --instrument --instrumentation-file=%t.fdata \ +RUN: --conservative-instrumentation -o 
%t.instrumented_conservative \ +RUN: --instrumentation-sleep-time=1 --instrumentation-no-counters-clear \ +RUN: --instrumentation-wait-forks + +# Instrumented program needs to finish returning zero +# Both output and profile must contain all 16 functions +RUN: %t.instrumented_conservative > %t.output +# Wait for profile and output to be fully written +RUN: bash %S/wait_file.sh %t.output +RUN: bash %S/wait_file.sh %t.fdata +RUN: cat %t.output | FileCheck %s --check-prefix=CHECK-OUTPUT +RUN: cat %t.fdata | FileCheck %s --check-prefix=CHECK-COMMON-PROF + +CHECK-OUTPUT-DAG: funcA +CHECK-OUTPUT-DAG: funcB +CHECK-OUTPUT-DAG: funcC +CHECK-OUTPUT-DAG: funcD +CHECK-OUTPUT-DAG: funcE +CHECK-OUTPUT-DAG: funcF +CHECK-OUTPUT-DAG: funcG +CHECK-OUTPUT-DAG: funcH +CHECK-OUTPUT-DAG: funcI +CHECK-OUTPUT-DAG: funcJ +CHECK-OUTPUT-DAG: funcK +CHECK-OUTPUT-DAG: funcL +CHECK-OUTPUT-DAG: funcM +CHECK-OUTPUT-DAG: funcN +CHECK-OUTPUT-DAG: funcO +CHECK-OUTPUT-DAG: funcP + +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcA 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcB 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcC 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcD 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcE 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcF 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcG 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcH 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcI 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcJ 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcK 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcL 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcM 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcN 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcO 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcP 0 0 1 + +RUN: llvm-bolt %t.exe --instrument --instrumentation-file=%t \ +RUN: 
--instrumentation-file-append-pid \ +RUN: -o %t.instrumented + +RUN: %t.instrumented > %t.output +# Wait till output is fully written in case child outlives parent +RUN: bash %S/wait_file.sh %t.output +# Make sure all functions were called +RUN: cat %t.output | FileCheck %s --check-prefix=CHECK-OUTPUT + +RUN: child_pid=$(cat %t.output | grep funcA | awk '{print $2;}') +RUN: par_pid=$(cat %t.output | grep funcB | awk '{print $2;}') + +RUN: bash %S/wait_file.sh %t.$child_pid.fdata +RUN: bash %S/wait_file.sh %t.$par_pid.fdata + +RUN: mv %t.$child_pid.fdata %t.child.fdata +RUN: mv %t.$par_pid.fdata %t.parent.fdata + +# Instrumented binary must produce two profiles with only local calls +# recorded. Functions called only in child should not appear in parent's +# process and vice versa. +RUN: cat %t.child.fdata | FileCheck %s --check-prefix=CHECK-CHILD +RUN: cat %t.child.fdata | FileCheck %s --check-prefix=CHECK-NOCHILD +RUN: cat %t.parent.fdata | FileCheck %s --check-prefix=CHECK-PARENT +RUN: cat %t.parent.fdata | FileCheck %s --check-prefix=CHECK-NOPARENT + +CHECK-CHILD-DAG: 1 main {{[0-9a-f]+}} 1 funcA 0 0 1 +CHECK-CHILD-DAG: 1 main {{[0-9a-f]+}} 1 funcC 0 0 1 +CHECK-CHILD-DAG: 1 main {{[0-9a-f]+}} 1 funcE 0 0 1 +CHECK-CHILD-DAG: 1 main {{[0-9a-f]+}} 1 funcG 0 0 1 +CHECK-CHILD-DAG: 1 main {{[0-9a-f]+}} 1 funcI 0 0 1 +CHECK-CHILD-DAG: 1 main {{[0-9a-f]+}} 1 funcK 0 0 1 +CHECK-CHILD-DAG: 1 main {{[0-9a-f]+}} 1 funcM 0 0 1 +CHECK-CHILD-DAG: 1 main {{[0-9a-f]+}} 1 funcO 0 0 1 + +CHECK-NOCHILD-NOT: funcB +CHECK-NOCHILD-NOT: funcD +CHECK-NOCHILD-NOT: funcF +CHECK-NOCHILD-NOT: funcH +CHECK-NOCHILD-NOT: funcJ +CHECK-NOCHILD-NOT: funcL +CHECK-NOCHILD-NOT: funcN +CHECK-NOCHILD-NOT: funcP + +CHECK-PARENT-DAG: 1 main {{[0-9a-f]+}} 1 funcB 0 0 1 +CHECK-PARENT-DAG: 1 main {{[0-9a-f]+}} 1 funcD 0 0 1 +CHECK-PARENT-DAG: 1 main {{[0-9a-f]+}} 1 funcF 0 0 1 +CHECK-PARENT-DAG: 1 main {{[0-9a-f]+}} 1 funcH 0 0 1 +CHECK-PARENT-DAG: 1 main {{[0-9a-f]+}} 1 funcJ 0 0 1 +CHECK-PARENT-DAG: 1 
main {{[0-9a-f]+}} 1 funcL 0 0 1 +CHECK-PARENT-DAG: 1 main {{[0-9a-f]+}} 1 funcN 0 0 1 +CHECK-PARENT-DAG: 1 main {{[0-9a-f]+}} 1 funcP 0 0 1 + +CHECK-NOPARENT-NOT: funcA +CHECK-NOPARENT-NOT: funcC +CHECK-NOPARENT-NOT: funcE +CHECK-NOPARENT-NOT: funcG +CHECK-NOPARENT-NOT: funcI +CHECK-NOPARENT-NOT: funcK +CHECK-NOPARENT-NOT: funcM +CHECK-NOPARENT-NOT: funcO + + */ diff --git a/bolt/test/runtime/wait_file.sh b/bolt/test/runtime/wait_file.sh new file mode 100644 index 0000000000000..42d4c5b29e795 --- /dev/null +++ b/bolt/test/runtime/wait_file.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +check_file() { + local file="$1" + if [ -z "$file" ]; then + echo "No file passed!" + exit 1 + fi + if [ ! -f "$file" ]; then + return 1 + fi + + fuser -s "$file" + local ret=$? + if [ $ret -eq 1 ]; then # no one has file open + return 0 + fi + if [ $ret -eq 0 ]; then # file open by some processes + return 1 + fi + if [ $ret -eq 127 ]; then + echo "fuser command not found!" + exit 1 + fi + + echo "Unexpected exit code $ret from fuser!" + exit 1 +} + +wait_file() { + local file="$1" + local max_sleep=10 + check_file "$file" + local ret=$? + while [ $ret -ne 0 ] && [ $max_sleep -ne 0 ]; do + sleep 1 + max_sleep=$((max_sleep - 1)) + check_file $file + ret=$? + done + if [ $max_sleep -eq 0 ]; then + echo "The file does not exist or the test hung!" 
+ exit 1 + fi + +} +file="$1" +wait_file "$file" diff --git a/clang-tools-extra/clangd/ClangdServer.cpp b/clang-tools-extra/clangd/ClangdServer.cpp index d44d1e272b9b7..8b542d0b2dec2 100644 --- a/clang-tools-extra/clangd/ClangdServer.cpp +++ b/clang-tools-extra/clangd/ClangdServer.cpp @@ -34,6 +34,7 @@ #include "support/MemoryTree.h" #include "support/ThreadsafeFS.h" #include "support/Trace.h" +#include "clang/Basic/Stack.h" #include "clang/Format/Format.h" #include "clang/Lex/Preprocessor.h" #include "clang/Tooling/CompilationDatabase.h" @@ -52,8 +53,8 @@ #include #include #include -#include #include +#include namespace clang { namespace clangd { @@ -112,6 +113,7 @@ struct UpdateIndexCallbacks : public ParsingCallbacks { FIndex(FIndex), // shared_ptr extends lifetime Stdlib(Stdlib)]() mutable { + clang::noteBottomOfStack(); IndexFileIn IF; IF.Symbols = indexStandardLibrary(std::move(CI), Loc, *TFS); if (Stdlib->isBest(LO)) diff --git a/clang-tools-extra/clangd/IncludeCleaner.cpp b/clang-tools-extra/clangd/IncludeCleaner.cpp index 9708c67ca2883..b2c04ac4d5463 100644 --- a/clang-tools-extra/clangd/IncludeCleaner.cpp +++ b/clang-tools-extra/clangd/IncludeCleaner.cpp @@ -70,6 +70,8 @@ bool isIgnored(llvm::StringRef HeaderPath, HeaderFilter IgnoreHeaders) { bool mayConsiderUnused( const Inclusion &Inc, ParsedAST &AST, const include_cleaner::PragmaIncludes *PI) { + if (PI && PI->shouldKeep(Inc.HashLine + 1)) + return false; // FIXME(kirillbobyrev): We currently do not support the umbrella headers. // System headers are likely to be standard library headers. // Until we have good support for umbrella headers, don't warn about them. @@ -81,8 +83,6 @@ bool mayConsiderUnused( AST.getIncludeStructure().getRealPath(HID)); assert(FE); if (PI) { - if (PI->shouldKeep(Inc.HashLine + 1)) - return false; // Check if main file is the public interface for a private header. If so we // shouldn't diagnose it as unused. 
if (auto PHeader = PI->getPublic(*FE); !PHeader.empty()) { diff --git a/clang-tools-extra/clangd/TUScheduler.cpp b/clang-tools-extra/clangd/TUScheduler.cpp index dd2ce16147a5d..324ba1fc8cb89 100644 --- a/clang-tools-extra/clangd/TUScheduler.cpp +++ b/clang-tools-extra/clangd/TUScheduler.cpp @@ -63,6 +63,7 @@ #include "support/ThreadCrashReporter.h" #include "support/Threading.h" #include "support/Trace.h" +#include "clang/Basic/Stack.h" #include "clang/Frontend/CompilerInvocation.h" #include "clang/Tooling/CompilationDatabase.h" #include "llvm/ADT/FunctionExtras.h" @@ -464,6 +465,10 @@ class PreambleThread { } void run() { + // We mark the current thread as the stack bottom so that clang running on this + // thread can notice the stack usage and prevent stack overflow with best + // efforts. Same applies to other calls throughout clangd. + clang::noteBottomOfStack(); while (true) { std::optional Throttle; { @@ -1383,6 +1388,7 @@ void ASTWorker::startTask(llvm::StringRef Name, } void ASTWorker::run() { + clang::noteBottomOfStack(); while (true) { { std::unique_lock Lock(Mutex); @@ -1777,6 +1783,7 @@ void TUScheduler::runWithPreamble(llvm::StringRef Name, PathRef File, Ctx = Context::current().derive(FileBeingProcessed, std::string(File)), Action = std::move(Action), this]() mutable { + clang::noteBottomOfStack(); ThreadCrashReporter ScopedReporter([&Name, &Contents, &Command]() { llvm::errs() << "Signalled during preamble action: " << Name << "\n"; crashDumpCompileCommand(llvm::errs(), Command); diff --git a/clang-tools-extra/clangd/index/Background.cpp b/clang-tools-extra/clangd/index/Background.cpp index c35de750435cc..7ef9511cf7c07 100644 --- a/clang-tools-extra/clangd/index/Background.cpp +++ b/clang-tools-extra/clangd/index/Background.cpp @@ -30,6 +30,7 @@ #include "support/Trace.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManager.h" +#include "clang/Basic/Stack.h" #include "clang/Frontend/FrontendAction.h" #include "llvm/ADT/ArrayRef.h" 
#include "llvm/ADT/DenseSet.h" @@ -108,6 +109,7 @@ BackgroundIndex::BackgroundIndex( for (unsigned I = 0; I < Opts.ThreadPoolSize; ++I) { ThreadPool.runAsync("background-worker-" + llvm::Twine(I + 1), [this, Ctx(Context::current().clone())]() mutable { + clang::noteBottomOfStack(); WithContext BGContext(std::move(Ctx)); Queue.work([&] { Rebuilder.idle(); }); }); diff --git a/clang-tools-extra/clangd/test/infinite-instantiation.test b/clang-tools-extra/clangd/test/infinite-instantiation.test new file mode 100644 index 0000000000000..85a1b656f4908 --- /dev/null +++ b/clang-tools-extra/clangd/test/infinite-instantiation.test @@ -0,0 +1,13 @@ +// RUN: cp %s %t.cpp +// RUN: not clangd -check=%t.cpp 2>&1 | FileCheck -strict-whitespace %s + +// CHECK: [template_recursion_depth_exceeded] + +template +constexpr int f(T... args) { + return f(0, args...); +} + +int main() { + auto i = f(); +} diff --git a/clang-tools-extra/clangd/tool/ClangdMain.cpp b/clang-tools-extra/clangd/tool/ClangdMain.cpp index ca5cced197cd2..f656a8c587c65 100644 --- a/clang-tools-extra/clangd/tool/ClangdMain.cpp +++ b/clang-tools-extra/clangd/tool/ClangdMain.cpp @@ -29,6 +29,7 @@ #include "support/ThreadCrashReporter.h" #include "support/ThreadsafeFS.h" #include "support/Trace.h" +#include "clang/Basic/Stack.h" #include "clang/Format/Format.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" @@ -710,6 +711,9 @@ enum class ErrorResultCode : int { }; int clangdMain(int argc, char *argv[]) { + // Clang could run on the main thread. e.g., when the flag '-check' or '-sync' + // is enabled. 
+ clang::noteBottomOfStack(); llvm::InitializeAllTargetInfos(); llvm::sys::PrintStackTraceOnErrorSignal(argv[0]); llvm::sys::AddSignalHandler( diff --git a/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp b/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp index c55351fb1f91d..83a7c45df1695 100644 --- a/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp +++ b/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp @@ -76,6 +76,8 @@ TEST(IncludeCleaner, StdlibUnused) { auto TU = TestTU::withCode(R"cpp( #include #include + #include // IWYU pragma: keep + #include // IWYU pragma: export std::list x; )cpp"); // Layout of std library impl is not relevant. @@ -84,10 +86,13 @@ TEST(IncludeCleaner, StdlibUnused) { namespace std { template class list {}; template class queue {}; + template class vector {}; } )cpp"; TU.AdditionalFiles["list"] = "#include "; TU.AdditionalFiles["queue"] = "#include "; + TU.AdditionalFiles["vector"] = "#include "; + TU.AdditionalFiles["string"] = "#include "; TU.ExtraArgs = {"-isystem", testRoot()}; auto AST = TU.build(); IncludeCleanerFindings Findings = computeIncludeCleanerFindings(AST); diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 5add59680fd76..b161be3a07752 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -273,6 +273,10 @@ Non-comprehensive list of changes in this release types. This allows access to ``llvm.nearbyint`` for arbitrary floating-point and vector of floating-point types. - Clang AST matcher now matches concept declarations with `conceptDecl`. +- Clang now supports more GCC stdio builtins: ``__builtin_vprintf``, ``__builtin_vfprintf``, + ``__builtin_fscanf``, ``__builtin_scanf``, ``__builtin_sscanf``, ``__builtin_vfscanf``, + ``__builtin_vscanf``, ``__builtin_vsscanf``. + New Compiler Flags ------------------ @@ -293,7 +297,11 @@ New Compiler Flags - ``-print-multi-flags-experimental`` prints the flags used for multilib selection. 
See `the multilib docs `_ for more details. - +- ``-maix32`` and ``-maix64`` are new GCC compatibility flags that select the + bitmode to target on AIX. +- ``-p`` is a new GCC compatibility flag for AIX and Linux which works + similarly to ``-pg`` by writing profile information, but targets the ``prof`` + tool as opposed to the ``gprof`` tool. Deprecated Compiler Flags ------------------------- @@ -328,7 +336,9 @@ Attribute Changes in Clang the flag ``-Wunsafe-buffer-usage`` is enabled. - ``__declspec`` attributes can now be used together with the using keyword. Before the attributes on ``__declspec`` was ignored, while now it will be forwarded to the - point where the alias is used. + point where the alias is used. Note, some incorrect uses of ``__declspec`` on a + ``using`` declaration were being silently ignored and will now be appropriately + diagnosed as ignoring the attribute. - Introduced a new ``USR`` (unified symbol resolution) clause inside of the existing ``__attribute__((external_source_symbol))`` attribute. Clang's indexer uses the optional USR value when indexing Clang's AST. This value is expected @@ -467,9 +477,8 @@ Improvements to Clang's diagnostics - ``-Wformat`` will no longer suggest a no-op fix-it for fixing scoped enum format warnings. Instead, it will suggest casting the enum object to the type specified in the format string. -- Clang now emits ``-Wconstant-logical-operand`` warning even when constant logical - operand is on left side. - (`#37919 `_) +- Clang constexpr evaluator now displays notes as well as an error when a constructor + of a base class is not called in the constructor of its derived class. Bug Fixes in This Version ------------------------- @@ -702,6 +711,21 @@ Bug Fixes in This Version - Fix a hang on valid C code passing a function type as an argument to ``typeof`` to form a function declaration. 
(`#64713 _`) +- Fixed an issue where accesses to the local variables of a coroutine during + ``await_suspend`` could be misoptimized, including accesses to the awaiter + object itself. + (`#56301 `_) + The current solution may bring performance regressions if the awaiters have + non-static data members. See + `#64945 `_ for details. +- Clang now correctly diagnoses ``function_needs_feature`` when always_inline + callee has incompatible target features with caller. +- Removed the linking of libraries when ``-r`` is passed to the driver on AIX. +- Fixed an Itanium ABI bug where we force exactly two-byte alignment on member + functions to reserve a bit in function pointers for identifying pointers to + virtual member functions even if the target required a greater function + alignment and/or did not have function pointers which point to function entry + points (i.e., uses function descriptor objects instead). Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -822,6 +846,13 @@ Bug Fixes to C++ Support - Fix constraint checking of non-generic lambdas. (`#63181 `_) +- Update ``FunctionDeclBitfields.NumFunctionDeclBits``. This fixes: + (`#64171 `_). + +- Fix a crash caused by substitution failure in expression requirements. + (`#64172 `_) and + (`#64723 `_). + Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -977,10 +1008,19 @@ CUDA Support AIX Support ^^^^^^^^^^^ -- Add an AIX-only link-time option, `-mxcoff-build-id=0xHEXSTRING`, to allow users - to embed a hex id in their binary such that it's readable by the program itself. - This option is an alternative to the `--build-id=0xHEXSTRING` GNU linker option - which is currently not supported by the AIX linker. +- Enabled ThinLTO support. Minimum OS requirement is AIX 7.2 TL5 SP6 or + the upcoming AIX 7.3 TL2. + +- Enabled integrated assembler (``-f[no-]integrated-as``) for LTO. LTO now + defaults to the integrated assembler. 
+ +- Enabled Clang-based instrumented profiling + (``-fprofile-instr-[generate|use]``). + +- Added an AIX-only link-time option, ``-mxcoff-build-id=0xHEXSTRING``, to allow + users to embed a hex id in their binary such that it's readable by the program + itself. This option is an alternative to the ``--build-id=0xHEXSTRING`` GNU + linker option, which is currently not supported by the AIX linker. - Introduced the ``-mxcoff-roptr`` option to place constant objects with relocatable address values in the read-only data section. This option should @@ -989,6 +1029,14 @@ AIX Support read-only data sections with relocatable address values that resolve to imported symbols are made writable. +- Implemented the ``-frecord-command-line`` option on AIX, which saves the + command-line options used from compiling a source file to the corresponding + object file or binary file. + +- Added a new linker option, ``-K``, that is used to align the header, text, + data, and loader sections of the output file so that each section begins on + a page boundary. + WebAssembly Support ^^^^^^^^^^^^^^^^^^^ - Shared library support (and PIC code generation) for WebAssembly is no longer @@ -1005,6 +1053,11 @@ AVR Support of ``USHRT_MAX`` is now ``unsigned int`` instead of ``int``, as required by the C standard. +PowerPC Support +^^^^^^^^^^^^^^^ +- Clang now emits errors when it detects incompatible target features for + PowerPC builtins. + DWARF Support in Clang ---------------------- diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h index 1b99709ca90d9..12137387b676a 100644 --- a/clang/include/clang/AST/DeclBase.h +++ b/clang/include/clang/AST/DeclBase.h @@ -1702,7 +1702,7 @@ class DeclContext { }; /// Number of non-inherited bits in FunctionDeclBitfields. - enum { NumFunctionDeclBits = 30 }; + enum { NumFunctionDeclBits = 31 }; /// Stores the bits used by CXXConstructorDecl. 
If modified /// NumCXXConstructorDeclBits and the accessor @@ -1714,12 +1714,12 @@ class DeclContext { /// For the bits in FunctionDeclBitfields. uint64_t : NumFunctionDeclBits; - /// 21 bits to fit in the remaining available space. + /// 20 bits to fit in the remaining available space. /// Note that this makes CXXConstructorDeclBitfields take /// exactly 64 bits and thus the width of NumCtorInitializers /// will need to be shrunk if some bit is added to NumDeclContextBitfields, /// NumFunctionDeclBitfields or CXXConstructorDeclBitfields. - uint64_t NumCtorInitializers : 18; + uint64_t NumCtorInitializers : 17; uint64_t IsInheritingConstructor : 1; /// Whether this constructor has a trail-allocated explicit specifier. diff --git a/clang/include/clang/AST/ExprConcepts.h b/clang/include/clang/AST/ExprConcepts.h index d900e980852b4..13d4568119eb2 100644 --- a/clang/include/clang/AST/ExprConcepts.h +++ b/clang/include/clang/AST/ExprConcepts.h @@ -14,20 +14,21 @@ #ifndef LLVM_CLANG_AST_EXPRCONCEPTS_H #define LLVM_CLANG_AST_EXPRCONCEPTS_H -#include "clang/AST/ASTContext.h" #include "clang/AST/ASTConcept.h" +#include "clang/AST/ASTContext.h" #include "clang/AST/Decl.h" -#include "clang/AST/DeclarationName.h" #include "clang/AST/DeclTemplate.h" +#include "clang/AST/DeclarationName.h" #include "clang/AST/Expr.h" #include "clang/AST/NestedNameSpecifier.h" #include "clang/AST/TemplateBase.h" #include "clang/AST/Type.h" #include "clang/Basic/SourceLocation.h" +#include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TrailingObjects.h" -#include #include +#include namespace clang { class ASTStmtReader; @@ -467,6 +468,13 @@ class NestedRequirement : public Requirement { } }; +using EntityPrinter = llvm::function_ref; + +/// \brief create a Requirement::SubstitutionDiagnostic with only a +/// SubstitutedEntity and DiagLoc using Sema's allocator. 
+Requirement::SubstitutionDiagnostic * +createSubstDiagAt(Sema &S, SourceLocation Location, EntityPrinter Printer); + } // namespace concepts /// C++2a [expr.prim.req]: diff --git a/clang/include/clang/Basic/DiagnosticASTKinds.td b/clang/include/clang/Basic/DiagnosticASTKinds.td index 566cdc3406058..0794ed7ba6837 100644 --- a/clang/include/clang/Basic/DiagnosticASTKinds.td +++ b/clang/include/clang/Basic/DiagnosticASTKinds.td @@ -70,6 +70,8 @@ def note_consteval_address_accessible : Note< "is not a constant expression">; def note_constexpr_uninitialized : Note< "subobject %0 is not initialized">; +def note_constexpr_uninitialized_base : Note< + "constructor of base class %0 is not called">; def note_constexpr_static_local : Note< "control flows through the definition of a %select{static|thread_local}0 variable">; def note_constexpr_subobject_declared_here : Note< diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 7b4d415bf0649..26bc88a980e4f 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -20,6 +20,7 @@ def DeprecatedStaticAnalyzerFlag : DiagGroup<"deprecated-static-analyzer-flag">; // Empty DiagGroups are recognized by clang but ignored. def ODR : DiagGroup<"odr">; def : DiagGroup<"abi">; +def : DiagGroup<"gnu-empty-initializer">; // Now a C extension, not GNU. 
def AbsoluteValue : DiagGroup<"absolute-value">; def MisspelledAssumption : DiagGroup<"misspelled-assumption">; def UnknownAssumption : DiagGroup<"unknown-assumption">; diff --git a/clang/include/clang/Basic/Sanitizers.h b/clang/include/clang/Basic/Sanitizers.h index db53010645ae3..4659e45c78834 100644 --- a/clang/include/clang/Basic/Sanitizers.h +++ b/clang/include/clang/Basic/Sanitizers.h @@ -23,7 +23,11 @@ namespace llvm { class hash_code; +class Triple; +namespace opt { +class ArgList; } +} // namespace llvm namespace clang { diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index 41ef47eb565b1..61be52149341f 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -1414,7 +1414,9 @@ class TargetInfo : public TransferrableTargetInfo, /// Identify whether this target supports IFuncs. bool supportsIFunc() const { - return getTriple().isOSBinFormatELF() && !getTriple().isOSFuchsia(); + return getTriple().isOSBinFormatELF() && + ((getTriple().isOSLinux() && !getTriple().isMusl()) || + getTriple().isOSFreeBSD()); } // Validate the contents of the __builtin_cpu_supports(const char*) diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 7acacd7bf4f50..76000156fece7 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -9612,9 +9612,8 @@ bool ASTContext::areLaxCompatibleRVVTypes(QualType FirstType, const LangOptions::LaxVectorConversionKind LVCKind = getLangOpts().getLaxVectorConversions(); - // If __riscv_v_fixed_vlen != N do not allow GNU vector lax conversion. - if (VecTy->getVectorKind() == VectorType::GenericVector && - getTypeSize(SecondType) != getRVVTypeSize(*this, BT)) + // If __riscv_v_fixed_vlen != N do not allow vector lax conversion. 
+ if (getTypeSize(SecondType) != getRVVTypeSize(*this, BT)) return false; // If -flax-vector-conversions=all is specified, the types are diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 2f2f4eef852fd..f1bad0c7f7f22 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -2418,9 +2418,16 @@ static bool CheckEvaluationResult(CheckEvaluationResultKind CERK, if (const CXXRecordDecl *CD = dyn_cast(RD)) { unsigned BaseIndex = 0; for (const CXXBaseSpecifier &BS : CD->bases()) { - if (!CheckEvaluationResult(CERK, Info, DiagLoc, BS.getType(), - Value.getStructBase(BaseIndex), Kind, - /*SubobjectDecl=*/nullptr, CheckedTemps)) + const APValue &BaseValue = Value.getStructBase(BaseIndex); + if (!BaseValue.hasValue()) { + SourceLocation TypeBeginLoc = BS.getBaseTypeLoc(); + Info.FFDiag(TypeBeginLoc, diag::note_constexpr_uninitialized_base) + << BS.getType() << SourceRange(TypeBeginLoc, BS.getEndLoc()); + return false; + } + if (!CheckEvaluationResult(CERK, Info, DiagLoc, BS.getType(), BaseValue, + Kind, /*SubobjectDecl=*/nullptr, + CheckedTemps)) return false; ++BaseIndex; } diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index fcc1620f7a043..0d1e9ad439b7d 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -5239,30 +5239,50 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, dyn_cast(ArgInfo.getCoerceToType()); if (STy && ArgInfo.isDirect() && ArgInfo.getCanBeFlattened()) { llvm::Type *SrcTy = Src.getElementType(); - uint64_t SrcSize = CGM.getDataLayout().getTypeAllocSize(SrcTy); - uint64_t DstSize = CGM.getDataLayout().getTypeAllocSize(STy); - - // If the source type is smaller than the destination type of the - // coerce-to logic, copy the source value into a temp alloca the size - // of the destination type to allow loading all of it. The bits past - // the source value are left undef. 
- if (SrcSize < DstSize) { - Address TempAlloca - = CreateTempAlloca(STy, Src.getAlignment(), - Src.getName() + ".coerce"); - Builder.CreateMemCpy(TempAlloca, Src, SrcSize); - Src = TempAlloca; + llvm::TypeSize SrcTypeSize = + CGM.getDataLayout().getTypeAllocSize(SrcTy); + llvm::TypeSize DstTypeSize = CGM.getDataLayout().getTypeAllocSize(STy); + if (SrcTypeSize.isScalable()) { + assert(STy->containsHomogeneousScalableVectorTypes() && + "ABI only supports structure with homogeneous scalable vector " + "type"); + assert(SrcTypeSize == DstTypeSize && + "Only allow non-fractional movement of structure with " + "homogeneous scalable vector type"); + assert(NumIRArgs == STy->getNumElements()); + + llvm::Value *StoredStructValue = + Builder.CreateLoad(Src, Src.getName() + ".tuple"); + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + llvm::Value *Extract = Builder.CreateExtractValue( + StoredStructValue, i, Src.getName() + ".extract" + Twine(i)); + IRCallArgs[FirstIRArg + i] = Extract; + } } else { - Src = Src.withElementType(STy); - } + uint64_t SrcSize = SrcTypeSize.getFixedValue(); + uint64_t DstSize = DstTypeSize.getFixedValue(); + + // If the source type is smaller than the destination type of the + // coerce-to logic, copy the source value into a temp alloca the size + // of the destination type to allow loading all of it. The bits past + // the source value are left undef. 
+ if (SrcSize < DstSize) { + Address TempAlloca = CreateTempAlloca(STy, Src.getAlignment(), + Src.getName() + ".coerce"); + Builder.CreateMemCpy(TempAlloca, Src, SrcSize); + Src = TempAlloca; + } else { + Src = Src.withElementType(STy); + } - assert(NumIRArgs == STy->getNumElements()); - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { - Address EltPtr = Builder.CreateStructGEP(Src, i); - llvm::Value *LI = Builder.CreateLoad(EltPtr); - if (ArgHasMaybeUndefAttr) - LI = Builder.CreateFreeze(LI); - IRCallArgs[FirstIRArg + i] = LI; + assert(NumIRArgs == STy->getNumElements()); + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + Address EltPtr = Builder.CreateStructGEP(Src, i); + llvm::Value *LI = Builder.CreateLoad(EltPtr); + if (ArgHasMaybeUndefAttr) + LI = Builder.CreateFreeze(LI); + IRCallArgs[FirstIRArg + i] = LI; + } } } else { // In the simple case, just pass the coerced loaded value. @@ -5467,6 +5487,30 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, Attrs.addFnAttribute(getLLVMContext(), llvm::Attribute::AlwaysInline); } + // The await_suspend call performed by co_await is essentially asynchronous + // to the execution of the coroutine. Inlining it normally into an unsplit + // coroutine can cause miscompilation because the coroutine CFG misrepresents + // the true control flow of the program: things that happen in the + // await_suspend are not guaranteed to happen prior to the resumption of the + // coroutine, and things that happen after the resumption of the coroutine + // (including its exit and the potential deallocation of the coroutine frame) + // are not guaranteed to happen only after the end of await_suspend. + // + // The short-term solution to this problem is to mark the call as uninlinable. + // But we don't want to do this if the call is known to be trivial, which is + // very common. 
+ // + // The long-term solution may introduce patterns like: + // + // call @llvm.coro.await_suspend(ptr %awaiter, ptr %handle, + // ptr @awaitSuspendFn) + // + // Then it is much easier to perform the safety analysis in the middle end. + // If it is safe to inline the call to awaitSuspend, we can replace it in the + // CoroEarly pass. Otherwise we could replace it in the CoroSplit pass. + if (inSuspendBlock() && mayCoroHandleEscape()) + Attrs = Attrs.addFnAttribute(getLLVMContext(), llvm::Attribute::NoInline); + // Disable inlining inside SEH __try blocks. if (isSEHTryScope()) { Attrs = Attrs.addFnAttribute(getLLVMContext(), llvm::Attribute::NoInline); diff --git a/clang/lib/CodeGen/CGCoroutine.cpp b/clang/lib/CodeGen/CGCoroutine.cpp index 8437cda79beb2..810ae7d51ec10 100644 --- a/clang/lib/CodeGen/CGCoroutine.cpp +++ b/clang/lib/CodeGen/CGCoroutine.cpp @@ -139,6 +139,36 @@ static bool memberCallExpressionCanThrow(const Expr *E) { return true; } +/// Return true when the coroutine handle may escape from the await-suspend +/// (`awaiter.await_suspend(std::coroutine_handle)` expression). +/// Return false only when the coroutine handle wouldn't escape in the +/// await-suspend for sure. +/// +/// While it is always safe to return true, returning false can bring better +/// performance. +/// +/// See https://github.com/llvm/llvm-project/issues/56301 and +/// https://reviews.llvm.org/D157070 for the example and the full discussion. +/// +/// FIXME: It will be much better to perform such analysis in the middle end. +/// See the comments in `CodeGenFunction::EmitCall` for example. +static bool MayCoroHandleEscape(CoroutineSuspendExpr const &S) { + CXXRecordDecl *Awaiter = + S.getCommonExpr()->getType().getNonReferenceType()->getAsCXXRecordDecl(); + + // Return true conservatively if the awaiter type is not a record type. + if (!Awaiter) + return true; + + // In case the awaiter type is empty, the suspend wouldn't leak the coroutine + // handle.
+ // + // TODO: We can improve this by looking into the implementation of + // await-suspend and see if the coroutine handle is passed to foreign + // functions. + return !Awaiter->field_empty(); +} + // Emit suspend expression which roughly looks like: // // auto && x = CommonExpr(); @@ -199,8 +229,11 @@ static LValueOrRValue emitSuspendExpression(CodeGenFunction &CGF, CGCoroData &Co auto *SaveCall = Builder.CreateCall(CoroSave, {NullPtr}); CGF.CurCoro.InSuspendBlock = true; + CGF.CurCoro.MayCoroHandleEscape = MayCoroHandleEscape(S); auto *SuspendRet = CGF.EmitScalarExpr(S.getSuspendExpr()); CGF.CurCoro.InSuspendBlock = false; + CGF.CurCoro.MayCoroHandleEscape = false; + if (SuspendRet != nullptr && SuspendRet->getType()->isIntegerTy(1)) { // Veto suspension if requested by bool returning await_suspend. BasicBlock *RealSuspendBlock = diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 8722fd4550e4a..28ec2b9700721 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -334,6 +334,7 @@ class CodeGenFunction : public CodeGenTypeCache { struct CGCoroInfo { std::unique_ptr Data; bool InSuspendBlock = false; + bool MayCoroHandleEscape = false; CGCoroInfo(); ~CGCoroInfo(); }; @@ -347,6 +348,10 @@ class CodeGenFunction : public CodeGenTypeCache { return isCoroutine() && CurCoro.InSuspendBlock; } + bool mayCoroHandleEscape() const { + return isCoroutine() && CurCoro.MayCoroHandleEscape; + } + /// CurGD - The GlobalDecl for the current function being compiled. GlobalDecl CurGD; diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index a3506df7d4e5a..f09d1129b128a 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -2386,7 +2386,7 @@ void CodeGenModule::SetLLVMFunctionAttributesForDefinition(const Decl *D, // functions. 
If the current target's C++ ABI requires this and this is a // member function, set its alignment accordingly. if (getTarget().getCXXABI().areMemberFunctionsAligned()) { - if (F->getPointerAlignment(getDataLayout()) < 2 && isa(D)) + if (isa(D) && F->getPointerAlignment(getDataLayout()) < 2) F->setAlignment(std::max(llvm::Align(2), F->getAlign().valueOrOne())); } diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index f6ea4d0b43667..bdbdad9362e19 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -4936,6 +4936,12 @@ void Driver::BuildJobs(Compilation &C) const { (void)C.getArgs().hasArg(options::OPT_driver_mode); (void)C.getArgs().hasArg(options::OPT_rsp_quoting); + bool HasAssembleJob = llvm::any_of(C.getJobs(), [](auto &J) { + // Match ClangAs and other derived assemblers of Tool. ClangAs uses a + // longer ShortName "clang integrated assembler" while other assemblers just + // use "assembler". + return strstr(J.getCreator().getShortName(), "assembler"); + }); for (Arg *A : C.getArgs()) { // FIXME: It would be nice to be able to send the argument to the // DiagnosticsEngine, so that extra values, position, and so on could be @@ -4965,7 +4971,7 @@ void Driver::BuildJobs(Compilation &C) const { // already been warned about. 
if (!IsCLMode() || !A->getOption().matches(options::OPT_UNKNOWN)) { if (A->getOption().hasFlag(options::TargetSpecific) && - !A->isIgnoredTargetSpecific()) { + !A->isIgnoredTargetSpecific() && !HasAssembleJob) { Diag(diag::err_drv_unsupported_opt_for_target) << A->getSpelling() << getTargetTriple(); } else { diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp index c3ce13f93464d..12fe55be9113e 100644 --- a/clang/lib/Driver/SanitizerArgs.cpp +++ b/clang/lib/Driver/SanitizerArgs.cpp @@ -37,6 +37,8 @@ static const SanitizerMask NeedsUbsanCxxRt = SanitizerKind::Vptr | SanitizerKind::CFI; static const SanitizerMask NotAllowedWithTrap = SanitizerKind::Vptr; static const SanitizerMask NotAllowedWithMinimalRuntime = SanitizerKind::Vptr; +static const SanitizerMask NotAllowedWithExecuteOnly = + SanitizerKind::Function | SanitizerKind::KCFI; static const SanitizerMask RequiresPIE = SanitizerKind::DataFlow | SanitizerKind::Scudo; static const SanitizerMask NeedsUnwindTables = @@ -141,6 +143,16 @@ static std::string describeSanitizeArg(const llvm::opt::Arg *A, /// Sanitizers set. static std::string toString(const clang::SanitizerSet &Sanitizers); +/// Return true if an execute-only target disallows data access to code +/// sections. +static bool isExecuteOnlyTarget(const llvm::Triple &Triple, + const llvm::opt::ArgList &Args) { + if (Triple.isPS5()) + return true; + return Args.hasFlagNoClaim(options::OPT_mexecute_only, + options::OPT_mno_execute_only, false); +} + static void validateSpecialCaseListFormat(const Driver &D, std::vector &SCLFiles, unsigned MalformedSCLErrorDiagID, @@ -395,6 +407,22 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC, DiagnosedKinds |= SanitizerKind::Function; } } + // -fsanitize=function and -fsanitize=kcfi instrument indirect function + // calls to load a type hash before the function label. Therefore, an + // execute-only target doesn't support the function and kcfi sanitizers. 
+ const llvm::Triple &Triple = TC.getTriple(); + if (isExecuteOnlyTarget(Triple, Args)) { + if (SanitizerMask KindsToDiagnose = + Add & NotAllowedWithExecuteOnly & ~DiagnosedKinds) { + if (DiagnoseErrors) { + std::string Desc = describeSanitizeArg(Arg, KindsToDiagnose); + D.Diag(diag::err_drv_argument_not_allowed_with) + << Desc << Triple.str(); + } + DiagnosedKinds |= KindsToDiagnose; + } + Add &= ~NotAllowedWithExecuteOnly; + } // FIXME: Make CFI on member function calls compatible with cross-DSO CFI. // There are currently two problems: @@ -457,6 +485,10 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC, if (MinimalRuntime) { Add &= ~NotAllowedWithMinimalRuntime; } + // NotAllowedWithExecuteOnly is silently discarded on an execute-only + // target if implicitly enabled through group expansion. + if (isExecuteOnlyTarget(Triple, Args)) + Add &= ~NotAllowedWithExecuteOnly; if (CfiCrossDso) Add &= ~SanitizerKind::CFIMFCall; Add &= Supported; diff --git a/clang/lib/Driver/ToolChains/AIX.cpp b/clang/lib/Driver/ToolChains/AIX.cpp index 97217eba9ca01..bfc86d9f34718 100644 --- a/clang/lib/Driver/ToolChains/AIX.cpp +++ b/clang/lib/Driver/ToolChains/AIX.cpp @@ -30,6 +30,7 @@ void aix::Assembler::ConstructJob(Compilation &C, const JobAction &JA, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { + const Driver &D = getToolChain().getDriver(); ArgStringList CmdArgs; const bool IsArch32Bit = getToolChain().getTriple().isArch32Bit(); @@ -38,6 +39,11 @@ void aix::Assembler::ConstructJob(Compilation &C, const JobAction &JA, if (!IsArch32Bit && !IsArch64Bit) llvm_unreachable("Unsupported bit width value."); + if (Arg *A = C.getArgs().getLastArg(options::OPT_G)) { + D.Diag(diag::err_drv_unsupported_opt_for_target) + << A->getSpelling() << D.getTargetTriple(); + } + // Specify the mode in which the as(1) command operates. 
if (IsArch32Bit) { CmdArgs.push_back("-a32"); diff --git a/clang/lib/Driver/ToolChains/Solaris.cpp b/clang/lib/Driver/ToolChains/Solaris.cpp index 86c789f5fcef5..de5a69e4ca3fd 100644 --- a/clang/lib/Driver/ToolChains/Solaris.cpp +++ b/clang/lib/Driver/ToolChains/Solaris.cpp @@ -47,11 +47,24 @@ void solaris::Assembler::ConstructJob(Compilation &C, const JobAction &JA, Exec, CmdArgs, Inputs, Output)); } +static bool getPIE(const ArgList &Args, const ToolChain &TC) { + if (Args.hasArg(options::OPT_shared) || Args.hasArg(options::OPT_static) || + Args.hasArg(options::OPT_r)) + return false; + + Arg *A = Args.getLastArg(options::OPT_pie, options::OPT_no_pie, + options::OPT_nopie); + if (!A) + return TC.isPIEDefault(Args); + return A->getOption().matches(options::OPT_pie); +} + void solaris::Linker::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { + const bool IsPIE = getPIE(Args, getToolChain()); ArgStringList CmdArgs; // Demangle C++ names in errors @@ -62,6 +75,11 @@ void solaris::Linker::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("_start"); } + if (IsPIE) { + CmdArgs.push_back("-z"); + CmdArgs.push_back("type=pie"); + } + if (Args.hasArg(options::OPT_static)) { CmdArgs.push_back("-Bstatic"); CmdArgs.push_back("-dn"); @@ -113,8 +131,13 @@ void solaris::Linker::ConstructJob(Compilation &C, const JobAction &JA, values_xpg = "values-xpg4.o"; CmdArgs.push_back( Args.MakeArgString(getToolChain().GetFilePath(values_xpg))); - CmdArgs.push_back( - Args.MakeArgString(getToolChain().GetFilePath("crtbegin.o"))); + + const char *crtbegin = nullptr; + if (Args.hasArg(options::OPT_shared) || IsPIE) + crtbegin = "crtbeginS.o"; + else + crtbegin = "crtbegin.o"; + CmdArgs.push_back(Args.MakeArgString(getToolChain().GetFilePath(crtbegin))); // Add crtfastmath.o if available and fast math is enabled. 
getToolChain().addFastMathRuntimeIfAvailable(Args, CmdArgs); } @@ -171,8 +194,12 @@ void solaris::Linker::ConstructJob(Compilation &C, const JobAction &JA, if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles, options::OPT_r)) { - CmdArgs.push_back( - Args.MakeArgString(getToolChain().GetFilePath("crtend.o"))); + if (Args.hasArg(options::OPT_shared) || IsPIE) + CmdArgs.push_back( + Args.MakeArgString(getToolChain().GetFilePath("crtendS.o"))); + else + CmdArgs.push_back( + Args.MakeArgString(getToolChain().GetFilePath("crtend.o"))); CmdArgs.push_back( Args.MakeArgString(getToolChain().GetFilePath("crtn.o"))); } diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 32619bc56f7a3..852437b9390fc 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -581,7 +581,8 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) { ProbablyBracedList = ProbablyBracedList || (NextTok->is(tok::l_brace) && LBraceStack.back().PrevTok && - LBraceStack.back().PrevTok->is(tok::identifier)); + LBraceStack.back().PrevTok->isOneOf(tok::identifier, + tok::greater)); ProbablyBracedList = ProbablyBracedList || @@ -2464,7 +2465,7 @@ bool UnwrappedLineParser::parseParens(TokenType AmpAmpTokenType) { const auto *PrevPrev = Prev ? 
Prev->getPreviousNonComment() : nullptr; const bool Blacklisted = PrevPrev && - (PrevPrev->is(tok::kw___attribute) || + (PrevPrev->isOneOf(tok::kw___attribute, tok::kw_decltype) || (SeenEqual && (PrevPrev->isOneOf(tok::kw_if, tok::kw_while) || PrevPrev->endsSequence(tok::kw_constexpr, tok::kw_if)))); diff --git a/clang/lib/Frontend/FrontendAction.cpp b/clang/lib/Frontend/FrontendAction.cpp index c6f958a6077bf..0bd4b01ff79db 100644 --- a/clang/lib/Frontend/FrontendAction.cpp +++ b/clang/lib/Frontend/FrontendAction.cpp @@ -15,6 +15,7 @@ #include "clang/Basic/FileEntry.h" #include "clang/Basic/LangStandard.h" #include "clang/Basic/Sarif.h" +#include "clang/Basic/Stack.h" #include "clang/Frontend/ASTUnit.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/FrontendDiagnostic.h" @@ -1150,6 +1151,10 @@ void ASTFrontendAction::ExecuteAction() { CompilerInstance &CI = getCompilerInstance(); if (!CI.hasPreprocessor()) return; + // This is a fallback: If the client forgets to invoke this, we mark the + // current stack as the bottom. Though not optimal, this could help prevent + // stack overflow during deep recursion. + clang::noteBottomOfStack(); // FIXME: Move the truncation aspect of this into Sema, we delayed this till // here so the source manager would be initialized. diff --git a/clang/lib/Interpreter/IncrementalExecutor.cpp b/clang/lib/Interpreter/IncrementalExecutor.cpp index 3f8d60630de41..2c4dfc9a611e0 100644 --- a/clang/lib/Interpreter/IncrementalExecutor.cpp +++ b/clang/lib/Interpreter/IncrementalExecutor.cpp @@ -92,12 +92,19 @@ llvm::Error IncrementalExecutor::runCtors() const { llvm::Expected IncrementalExecutor::getSymbolAddress(llvm::StringRef Name, SymbolNameKind NameKind) const { - auto Sym = (NameKind == LinkerName) ? 
Jit->lookupLinkerMangled(Name) - : Jit->lookup(Name); - - if (!Sym) - return Sym.takeError(); - return Sym; + using namespace llvm::orc; + auto SO = makeJITDylibSearchOrder({&Jit->getMainJITDylib(), + Jit->getPlatformJITDylib().get(), + Jit->getProcessSymbolsJITDylib().get()}); + + ExecutionSession &ES = Jit->getExecutionSession(); + + auto SymOrErr = + ES.lookup(SO, (NameKind == LinkerName) ? ES.intern(Name) + : Jit->mangleAndIntern(Name)); + if (auto Err = SymOrErr.takeError()) + return std::move(Err); + return SymOrErr->getAddress(); } } // end namespace clang diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index 423d5372a6f65..1cff4a75790ec 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -19,6 +19,7 @@ #include "clang/AST/CharUnits.h" #include "clang/AST/DeclObjC.h" #include "clang/AST/ExprCXX.h" +#include "clang/AST/ExprConcepts.h" #include "clang/AST/ExprObjC.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/AST/Type.h" @@ -9072,16 +9073,24 @@ Sema::BuildExprRequirement( MultiLevelTemplateArgumentList MLTAL(Param, TAL.asArray(), /*Final=*/false); MLTAL.addOuterRetainedLevels(TPL->getDepth()); - Expr *IDC = Param->getTypeConstraint()->getImmediatelyDeclaredConstraint(); + const TypeConstraint *TC = Param->getTypeConstraint(); + assert(TC && "Type Constraint cannot be null here"); + auto *IDC = TC->getImmediatelyDeclaredConstraint(); + assert(IDC && "ImmediatelyDeclaredConstraint can't be null here."); ExprResult Constraint = SubstExpr(IDC, MLTAL); if (Constraint.isInvalid()) { - Status = concepts::ExprRequirement::SS_ExprSubstitutionFailure; - } else { - SubstitutedConstraintExpr = - cast(Constraint.get()); - if (!SubstitutedConstraintExpr->isSatisfied()) - Status = concepts::ExprRequirement::SS_ConstraintsNotSatisfied; - } + return new (Context) concepts::ExprRequirement( + concepts::createSubstDiagAt(*this, IDC->getExprLoc(), + [&](llvm::raw_ostream &OS) { + IDC->printPretty(OS, 
/*Helper=*/nullptr, + getPrintingPolicy()); + }), + IsSimple, NoexceptLoc, ReturnTypeRequirement); + } + SubstitutedConstraintExpr = + cast(Constraint.get()); + if (!SubstitutedConstraintExpr->isSatisfied()) + Status = concepts::ExprRequirement::SS_ConstraintsNotSatisfied; } return new (Context) concepts::ExprRequirement(E, IsSimple, NoexceptLoc, ReturnTypeRequirement, Status, diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index 8702e2ca3a1b3..394006a57747d 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -2276,9 +2276,9 @@ QualType TemplateInstantiator::TransformSubstTemplateTypeParmPackType( getPackIndex(Pack), Arg, TL.getNameLoc()); } -template static concepts::Requirement::SubstitutionDiagnostic * -createSubstDiag(Sema &S, TemplateDeductionInfo &Info, EntityPrinter Printer) { +createSubstDiag(Sema &S, TemplateDeductionInfo &Info, + concepts::EntityPrinter Printer) { SmallString<128> Message; SourceLocation ErrorLoc; if (Info.hasSFINAEDiagnostic()) { @@ -2302,6 +2302,19 @@ createSubstDiag(Sema &S, TemplateDeductionInfo &Info, EntityPrinter Printer) { StringRef(MessageBuf, Message.size())}; } +concepts::Requirement::SubstitutionDiagnostic * +concepts::createSubstDiagAt(Sema &S, SourceLocation Location, + EntityPrinter Printer) { + SmallString<128> Entity; + llvm::raw_svector_ostream OS(Entity); + Printer(OS); + char *EntityBuf = new (S.Context) char[Entity.size()]; + llvm::copy(Entity, EntityBuf); + return new (S.Context) concepts::Requirement::SubstitutionDiagnostic{ + /*SubstitutedEntity=*/StringRef(EntityBuf, Entity.size()), + /*DiagLoc=*/Location, /*DiagMessage=*/StringRef()}; +} + ExprResult TemplateInstantiator::TransformRequiresTypeParams( SourceLocation KWLoc, SourceLocation RBraceLoc, const RequiresExpr *RE, RequiresExprBodyDecl *Body, ArrayRef Params, diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 
10b3587885e39..097e81ea7d45a 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -7478,6 +7478,10 @@ StmtResult TreeTransform::TransformCompoundStmt(CompoundStmt *S, bool IsStmtExpr) { Sema::CompoundScopeRAII CompoundScope(getSema()); + Sema::FPFeaturesStateRAII FPSave(getSema()); + if (S->hasStoredFPFeatures()) + getSema().resetFPOptions( + S->getStoredFPFeatures().applyOverrides(getSema().getLangOpts())); const Stmt *ExprResult = S->getStmtExprResult(); bool SubStmtInvalid = false; diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index 59dbc36d24e8c..8dd78152bd687 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -580,7 +580,7 @@ void ASTDeclWriter::VisitDeclaratorDecl(DeclaratorDecl *D) { } void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) { - static_assert(DeclContext::NumFunctionDeclBits == 30, + static_assert(DeclContext::NumFunctionDeclBits == 31, "You need to update the serializer after you change the " "FunctionDeclBits"); @@ -1495,7 +1495,7 @@ void ASTDeclWriter::VisitCXXMethodDecl(CXXMethodDecl *D) { } void ASTDeclWriter::VisitCXXConstructorDecl(CXXConstructorDecl *D) { - static_assert(DeclContext::NumCXXConstructorDeclBits == 21, + static_assert(DeclContext::NumCXXConstructorDeclBits == 20, "You need to update the serializer after you change the " "CXXConstructorDeclBits"); diff --git a/clang/lib/StaticAnalyzer/Frontend/CMakeLists.txt b/clang/lib/StaticAnalyzer/Frontend/CMakeLists.txt index 5293f5e0a522d..0326798e3a174 100644 --- a/clang/lib/StaticAnalyzer/Frontend/CMakeLists.txt +++ b/clang/lib/StaticAnalyzer/Frontend/CMakeLists.txt @@ -26,4 +26,5 @@ add_clang_library(clangStaticAnalyzerFrontend DEPENDS omp_gen + ClangDriverOptions ) diff --git a/clang/lib/Tooling/Inclusions/Stdlib/StdSymbolMap.inc b/clang/lib/Tooling/Inclusions/Stdlib/StdSymbolMap.inc index a08ec11e77a4a..b46bd2e4d7a4b 100644 --- 
a/clang/lib/Tooling/Inclusions/Stdlib/StdSymbolMap.inc +++ b/clang/lib/Tooling/Inclusions/Stdlib/StdSymbolMap.inc @@ -3773,6 +3773,33 @@ SYMBOL(viewable_range, std::ranges::, ) SYMBOL(wistream_view, std::ranges::, ) SYMBOL(zip_transform_view, std::ranges::, ) SYMBOL(zip_view, std::ranges::, ) +SYMBOL(all, std::ranges::views::, ) +SYMBOL(all_t, std::ranges::views::, ) +SYMBOL(as_const, std::ranges::views::, ) +SYMBOL(as_rvalue, std::ranges::views::, ) +SYMBOL(common, std::ranges::views::, ) +SYMBOL(counted, std::ranges::views::, ) +SYMBOL(drop, std::ranges::views::, ) +SYMBOL(drop_while, std::ranges::views::, ) +SYMBOL(elements, std::ranges::views::, ) +SYMBOL(empty, std::ranges::views::, ) +SYMBOL(filter, std::ranges::views::, ) +SYMBOL(iota, std::ranges::views::, ) +SYMBOL(istream, std::ranges::views::, ) +SYMBOL(istream, std::ranges::views::, ) +SYMBOL(join, std::ranges::views::, ) +SYMBOL(join_with, std::ranges::views::, ) +SYMBOL(keys, std::ranges::views::, ) +SYMBOL(lazy_split, std::ranges::views::, ) +SYMBOL(reverse, std::ranges::views::, ) +SYMBOL(single, std::ranges::views::, ) +SYMBOL(split, std::ranges::views::, ) +SYMBOL(take, std::ranges::views::, ) +SYMBOL(take_while, std::ranges::views::, ) +SYMBOL(transform, std::ranges::views::, ) +SYMBOL(values, std::ranges::views::, ) +SYMBOL(zip, std::ranges::views::, ) +SYMBOL(zip_transform, std::ranges::views::, ) SYMBOL(ECMAScript, std::regex_constants::, ) SYMBOL(awk, std::regex_constants::, ) SYMBOL(basic, std::regex_constants::, ) @@ -3817,3 +3844,30 @@ SYMBOL(get_id, std::this_thread::, ) SYMBOL(sleep_for, std::this_thread::, ) SYMBOL(sleep_until, std::this_thread::, ) SYMBOL(yield, std::this_thread::, ) +SYMBOL(all, std::views::, ) +SYMBOL(all_t, std::views::, ) +SYMBOL(as_const, std::views::, ) +SYMBOL(as_rvalue, std::views::, ) +SYMBOL(common, std::views::, ) +SYMBOL(counted, std::views::, ) +SYMBOL(drop, std::views::, ) +SYMBOL(drop_while, std::views::, ) +SYMBOL(elements, std::views::, ) 
+SYMBOL(empty, std::views::, ) +SYMBOL(filter, std::views::, ) +SYMBOL(iota, std::views::, ) +SYMBOL(istream, std::views::, ) +SYMBOL(istream, std::views::, ) +SYMBOL(join, std::views::, ) +SYMBOL(join_with, std::views::, ) +SYMBOL(keys, std::views::, ) +SYMBOL(lazy_split, std::views::, ) +SYMBOL(reverse, std::views::, ) +SYMBOL(single, std::views::, ) +SYMBOL(split, std::views::, ) +SYMBOL(take, std::views::, ) +SYMBOL(take_while, std::views::, ) +SYMBOL(transform, std::views::, ) +SYMBOL(values, std::views::, ) +SYMBOL(zip, std::views::, ) +SYMBOL(zip_transform, std::views::, ) diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-tuple-type.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-tuple-type.c index f4235795a8622..f8d755992eeac 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-tuple-type.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-tuple-type.c @@ -90,3 +90,36 @@ void baz(__rvv_int32m1x2_t v_tuple) { __rvv_int32m1x2_t qux(__rvv_int32m1x2_t v_tuple) { return v_tuple; } + +// O0-LABEL: define dso_local { , } @quux +// O0-SAME: ( [[V_TUPLE_COERCE0:%.*]], [[V_TUPLE_COERCE1:%.*]]) #[[ATTR0]] { +// O0-NEXT: entry: +// O0-NEXT: [[V_TUPLE:%.*]] = alloca { , }, align 4 +// O0-NEXT: [[V_TUPLE_ADDR:%.*]] = alloca { , }, align 4 +// O0-NEXT: [[COERCE:%.*]] = alloca { , }, align 4 +// O0-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[V_TUPLE_COERCE0]], 0 +// O0-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[V_TUPLE_COERCE1]], 1 +// O0-NEXT: store { , } [[TMP1]], ptr [[V_TUPLE]], align 4 +// O0-NEXT: [[V_TUPLE1:%.*]] = load { , }, ptr [[V_TUPLE]], align 4 +// O0-NEXT: store { , } [[V_TUPLE1]], ptr [[V_TUPLE_ADDR]], align 4 +// O0-NEXT: [[TMP2:%.*]] = load { , }, ptr [[V_TUPLE_ADDR]], align 4 +// O0-NEXT: store { , } [[TMP2]], ptr [[COERCE]], align 4 +// O0-NEXT: [[COERCE_TUPLE:%.*]] = load { , }, ptr [[COERCE]], align 4 +// O0-NEXT: [[COERCE_EXTRACT0:%.*]] = extractvalue { , } [[COERCE_TUPLE]], 0 
+// O0-NEXT: [[COERCE_EXTRACT1:%.*]] = extractvalue { , } [[COERCE_TUPLE]], 1 +// O0-NEXT: [[CALL:%.*]] = call { , } @qux( [[COERCE_EXTRACT0]], [[COERCE_EXTRACT1]]) +// O0-NEXT: ret { , } [[CALL]] +// +// AFTER_MEM2REG-LABEL: define dso_local { , } @quux +// AFTER_MEM2REG-SAME: ( [[V_TUPLE_COERCE0:%.*]], [[V_TUPLE_COERCE1:%.*]]) #[[ATTR0]] { +// AFTER_MEM2REG-NEXT: entry: +// AFTER_MEM2REG-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[V_TUPLE_COERCE0]], 0 +// AFTER_MEM2REG-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[V_TUPLE_COERCE1]], 1 +// AFTER_MEM2REG-NEXT: [[COERCE_EXTRACT0:%.*]] = extractvalue { , } [[TMP1]], 0 +// AFTER_MEM2REG-NEXT: [[COERCE_EXTRACT1:%.*]] = extractvalue { , } [[TMP1]], 1 +// AFTER_MEM2REG-NEXT: [[CALL:%.*]] = call { , } @qux( [[COERCE_EXTRACT0]], [[COERCE_EXTRACT1]]) +// AFTER_MEM2REG-NEXT: ret { , } [[CALL]] +// +__rvv_int32m1x2_t quux(__rvv_int32m1x2_t v_tuple) { + return qux(v_tuple); +} diff --git a/clang/test/CodeGen/attr-target-mv-va-args.c b/clang/test/CodeGen/attr-target-mv-va-args.c index e75796d7ee038..96821c610235b 100644 --- a/clang/test/CodeGen/attr-target-mv-va-args.c +++ b/clang/test/CodeGen/attr-target-mv-va-args.c @@ -1,6 +1,8 @@ -// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=LINUX +// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=IFUNC-ELF +// RUN: %clang_cc1 -triple x86_64-pc-freebsd -emit-llvm %s -o - | FileCheck %s --check-prefix=IFUNC-ELF // RUN: %clang_cc1 -triple x86_64-windows-pc -emit-llvm %s -o - | FileCheck %s --check-prefixes=NO-IFUNC,WINDOWS -// RUN: %clang_cc1 -triple x86_64-fuchsia -emit-llvm %s -o - | FileCheck %s --check-prefixes=NO-IFUNC,FUCHSIA +// RUN: %clang_cc1 -triple x86_64-linux-musl -emit-llvm %s -o - | FileCheck %s --check-prefixes=NO-IFUNC,NO-IFUNC-ELF +// RUN: %clang_cc1 -triple x86_64-fuchsia -emit-llvm %s -o - | FileCheck %s --check-prefixes=NO-IFUNC,NO-IFUNC-ELF int 
__attribute__((target("sse4.2"))) foo(int i, ...) { return 0; } int __attribute__((target("arch=sandybridge"))) foo(int i, ...); int __attribute__((target("arch=ivybridge"))) foo(int i, ...) {return 1;} @@ -10,23 +12,23 @@ int bar(void) { return foo(1, 'a', 1.1) + foo(2, 2.2, "asdf"); } -// LINUX: @foo.ifunc = weak_odr ifunc i32 (i32, ...), ptr @foo.resolver -// LINUX: define{{.*}} i32 @foo.sse4.2(i32 noundef %i, ...) -// LINUX: ret i32 0 -// LINUX: define{{.*}} i32 @foo.arch_ivybridge(i32 noundef %i, ...) -// LINUX: ret i32 1 -// LINUX: define{{.*}} i32 @foo(i32 noundef %i, ...) -// LINUX: ret i32 2 -// LINUX: define{{.*}} i32 @bar() -// LINUX: call i32 (i32, ...) @foo.ifunc(i32 noundef 1, i32 noundef 97, double -// LINUX: call i32 (i32, ...) @foo.ifunc(i32 noundef 2, double noundef 2.2{{[0-9Ee+]+}}, ptr noundef +// IFUNC-ELF: @foo.ifunc = weak_odr ifunc i32 (i32, ...), ptr @foo.resolver +// IFUNC-ELF: define{{.*}} i32 @foo.sse4.2(i32 noundef %i, ...) +// IFUNC-ELF: ret i32 0 +// IFUNC-ELF: define{{.*}} i32 @foo.arch_ivybridge(i32 noundef %i, ...) +// IFUNC-ELF: ret i32 1 +// IFUNC-ELF: define{{.*}} i32 @foo(i32 noundef %i, ...) +// IFUNC-ELF: ret i32 2 +// IFUNC-ELF: define{{.*}} i32 @bar() +// IFUNC-ELF: call i32 (i32, ...) @foo.ifunc(i32 noundef 1, i32 noundef 97, double +// IFUNC-ELF: call i32 (i32, ...) @foo.ifunc(i32 noundef 2, double noundef 2.2{{[0-9Ee+]+}}, ptr noundef -// LINUX: define weak_odr ptr @foo.resolver() comdat -// LINUX: ret ptr @foo.arch_sandybridge -// LINUX: ret ptr @foo.arch_ivybridge -// LINUX: ret ptr @foo.sse4.2 -// LINUX: ret ptr @foo -// LINUX: declare i32 @foo.arch_sandybridge(i32 noundef, ...) +// IFUNC-ELF: define weak_odr ptr @foo.resolver() comdat +// IFUNC-ELF: ret ptr @foo.arch_sandybridge +// IFUNC-ELF: ret ptr @foo.arch_ivybridge +// IFUNC-ELF: ret ptr @foo.sse4.2 +// IFUNC-ELF: ret ptr @foo +// IFUNC-ELF: declare i32 @foo.arch_sandybridge(i32 noundef, ...) // NO-IFUNC: define dso_local i32 @foo.sse4.2(i32 noundef %i, ...) 
// NO-IFUNC: ret i32 0 @@ -39,10 +41,10 @@ int bar(void) { // NO-IFUNC: call i32 (i32, ...) @foo.resolver(i32 noundef 2, double noundef 2.2{{[0-9Ee+]+}}, ptr noundef // WINDOWS: define weak_odr dso_local i32 @foo.resolver(i32 %0, ...) comdat -// FUCHSIA: define weak_odr i32 @foo.resolver(i32 %0, ...) comdat +// NO-IFUNC-ELF: define weak_odr i32 @foo.resolver(i32 %0, ...) comdat // NO-IFUNC: musttail call i32 (i32, ...) @foo.arch_sandybridge // NO-IFUNC: musttail call i32 (i32, ...) @foo.arch_ivybridge // NO-IFUNC: musttail call i32 (i32, ...) @foo.sse4.2 // NO-IFUNC: musttail call i32 (i32, ...) @foo // WINDOWS: declare dso_local i32 @foo.arch_sandybridge(i32 noundef, ...) -// FUCHSIA: declare i32 @foo.arch_sandybridge(i32 noundef, ...) +// NO-IFUNC-ELF: declare i32 @foo.arch_sandybridge(i32 noundef, ...) diff --git a/clang/test/CodeGen/unique-internal-linkage-names.cpp b/clang/test/CodeGen/unique-internal-linkage-names.cpp index 731c2c143d7b7..65cf9db80b917 100644 --- a/clang/test/CodeGen/unique-internal-linkage-names.cpp +++ b/clang/test/CodeGen/unique-internal-linkage-names.cpp @@ -1,7 +1,7 @@ // This test checks if internal linkage symbols get unique names with // -funique-internal-linkage-names option. 
-// RUN: %clang_cc1 -triple x86_64 -x c++ -S -emit-llvm -o - < %s | FileCheck %s --check-prefix=PLAIN -// RUN: %clang_cc1 -triple x86_64 -x c++ -S -emit-llvm -funique-internal-linkage-names -o - < %s | FileCheck %s --check-prefix=UNIQUE +// RUN: %clang_cc1 -triple x86_64-linux-gnu -x c++ -S -emit-llvm -o - < %s | FileCheck %s --check-prefix=PLAIN +// RUN: %clang_cc1 -triple x86_64-linux-gnu -x c++ -S -emit-llvm -funique-internal-linkage-names -o - < %s | FileCheck %s --check-prefix=UNIQUE static int glob; static int foo() { diff --git a/clang/test/CodeGenCoroutines/coro-awaiter-noinline-suspend.cpp b/clang/test/CodeGenCoroutines/coro-awaiter-noinline-suspend.cpp new file mode 100644 index 0000000000000..f935e256d9db9 --- /dev/null +++ b/clang/test/CodeGenCoroutines/coro-awaiter-noinline-suspend.cpp @@ -0,0 +1,207 @@ +// Tests that we can mark await-suspend as noinline correctly. +// +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s \ +// RUN: -disable-llvm-passes | FileCheck %s + +#include "Inputs/coroutine.h" + +struct Task { + struct promise_type { + struct FinalAwaiter { + bool await_ready() const noexcept { return false; } + template + std::coroutine_handle<> await_suspend(std::coroutine_handle h) noexcept { + return h.promise().continuation; + } + void await_resume() noexcept {} + }; + + Task get_return_object() noexcept { + return std::coroutine_handle::from_promise(*this); + } + + std::suspend_always initial_suspend() noexcept { return {}; } + FinalAwaiter final_suspend() noexcept { return {}; } + void unhandled_exception() noexcept {} + void return_void() noexcept {} + + std::coroutine_handle<> continuation; + }; + + Task(std::coroutine_handle handle); + ~Task(); + +private: + std::coroutine_handle handle; +}; + +struct StatefulAwaiter { + int value; + bool await_ready() const noexcept { return false; } + template + void await_suspend(std::coroutine_handle h) noexcept {} + void await_resume() noexcept {} +}; + +typedef 
std::suspend_always NoStateAwaiter; +using AnotherStatefulAwaiter = StatefulAwaiter; + +template +struct TemplatedAwaiter { + T value; + bool await_ready() const noexcept { return false; } + template + void await_suspend(std::coroutine_handle h) noexcept {} + void await_resume() noexcept {} +}; + + +class Awaitable {}; +StatefulAwaiter operator co_await(Awaitable) { + return StatefulAwaiter{}; +} + +StatefulAwaiter GlobalAwaiter; +class Awaitable2 {}; +StatefulAwaiter& operator co_await(Awaitable2) { + return GlobalAwaiter; +} + +Task testing() { + co_await std::suspend_always{}; + co_await StatefulAwaiter{}; + co_await AnotherStatefulAwaiter{}; + + // Test lvalue case. + StatefulAwaiter awaiter; + co_await awaiter; + + // The explicit call to await_suspend is not considered suspended. + awaiter.await_suspend(std::coroutine_handle::from_address(nullptr)); + + co_await TemplatedAwaiter{}; + TemplatedAwaiter TemplatedAwaiterInstace; + co_await TemplatedAwaiterInstace; + + co_await Awaitable{}; + co_await Awaitable2{}; +} + +// CHECK-LABEL: @_Z7testingv + +// Check `co_await __promise__.initial_suspend();` Since it returns std::suspend_always, +// which is an empty class, we shouldn't generate optimization blocker for it. +// CHECK: call token @llvm.coro.save +// CHECK: call void @_ZNSt14suspend_always13await_suspendESt16coroutine_handleIvE{{.*}}#[[NORMAL_ATTR:[0-9]+]] + +// Check the `co_await std::suspend_always{};` expression. We shouldn't emit the optimization +// blocker for it since it is an empty class. +// CHECK: call token @llvm.coro.save +// CHECK: call void @_ZNSt14suspend_always13await_suspendESt16coroutine_handleIvE{{.*}}#[[NORMAL_ATTR]] + +// Check `co_await StatefulAwaiter{};`. We need to emit the optimization blocker since +// the awaiter is not empty. 
+// CHECK: call token @llvm.coro.save +// CHECK: call void @_ZN15StatefulAwaiter13await_suspendIN4Task12promise_typeEEEvSt16coroutine_handleIT_E{{.*}}#[[NOINLINE_ATTR:[0-9]+]] + +// Check `co_await AnotherStatefulAwaiter{};` to make sure that we can handle TypedefTypes. +// CHECK: call token @llvm.coro.save +// CHECK: call void @_ZN15StatefulAwaiter13await_suspendIN4Task12promise_typeEEEvSt16coroutine_handleIT_E{{.*}}#[[NOINLINE_ATTR]] + +// Check `co_await awaiter;` to make sure we can handle lvalue cases. +// CHECK: call token @llvm.coro.save +// CHECK: call void @_ZN15StatefulAwaiter13await_suspendIN4Task12promise_typeEEEvSt16coroutine_handleIT_E{{.*}}#[[NOINLINE_ATTR]] + +// Check `awaiter.await_suspend(...)` to make sure the explicit call the await_suspend won't be marked as noinline +// CHECK: call void @_ZN15StatefulAwaiter13await_suspendIvEEvSt16coroutine_handleIT_E{{.*}}#[[NORMAL_ATTR]] + +// Check `co_await TemplatedAwaiter{};` to make sure we can handle specialized template +// type. +// CHECK: call token @llvm.coro.save +// CHECK: call void @_ZN16TemplatedAwaiterIiE13await_suspendIN4Task12promise_typeEEEvSt16coroutine_handleIT_E{{.*}}#[[NOINLINE_ATTR]] + +// Check `co_await TemplatedAwaiterInstace;` to make sure we can handle the lvalue from +// specialized template type. 
+// CHECK: call token @llvm.coro.save +// CHECK: call void @_ZN16TemplatedAwaiterIiE13await_suspendIN4Task12promise_typeEEEvSt16coroutine_handleIT_E{{.*}}#[[NOINLINE_ATTR]] + +// Check `co_await Awaitable{};` to make sure we can handle awaiter returned by +// `operator co_await`; +// CHECK: call token @llvm.coro.save +// CHECK: call void @_ZN15StatefulAwaiter13await_suspendIN4Task12promise_typeEEEvSt16coroutine_handleIT_E{{.*}}#[[NOINLINE_ATTR]] + +// Check `co_await Awaitable2{};` to make sure we can handle awaiter returned by +// `operator co_await` which returns a reference; +// CHECK: call token @llvm.coro.save +// CHECK: call void @_ZN15StatefulAwaiter13await_suspendIN4Task12promise_typeEEEvSt16coroutine_handleIT_E{{.*}}#[[NOINLINE_ATTR]] + +// Check `co_await __promise__.final_suspend();`. We don't emit an blocker here since it is +// empty. +// CHECK: call token @llvm.coro.save +// CHECK: call ptr @_ZN4Task12promise_type12FinalAwaiter13await_suspendIS0_EESt16coroutine_handleIvES3_IT_E{{.*}}#[[NORMAL_ATTR]] + +struct AwaitTransformTask { + struct promise_type { + struct FinalAwaiter { + bool await_ready() const noexcept { return false; } + template + std::coroutine_handle<> await_suspend(std::coroutine_handle h) noexcept { + return h.promise().continuation; + } + void await_resume() noexcept {} + }; + + AwaitTransformTask get_return_object() noexcept { + return std::coroutine_handle::from_promise(*this); + } + + std::suspend_always initial_suspend() noexcept { return {}; } + FinalAwaiter final_suspend() noexcept { return {}; } + void unhandled_exception() noexcept {} + void return_void() noexcept {} + + template + auto await_transform(Awaitable &&awaitable) { + return awaitable; + } + + std::coroutine_handle<> continuation; + }; + + AwaitTransformTask(std::coroutine_handle handle); + ~AwaitTransformTask(); + +private: + std::coroutine_handle handle; +}; + +struct awaitableWithGetAwaiter { + bool await_ready() const noexcept { return false; } + template + void 
await_suspend(std::coroutine_handle h) noexcept {} + void await_resume() noexcept {} +}; + +AwaitTransformTask testingWithAwaitTransform() { + co_await awaitableWithGetAwaiter{}; +} + +// CHECK-LABEL: @_Z25testingWithAwaitTransformv + +// Init suspend +// CHECK: call token @llvm.coro.save +// CHECK-NOT: call void @llvm.coro.opt.blocker( +// CHECK: call void @_ZNSt14suspend_always13await_suspendESt16coroutine_handleIvE{{.*}}#[[NORMAL_ATTR]] + +// Check `co_await awaitableWithGetAwaiter{};`. +// CHECK: call token @llvm.coro.save +// CHECK-NOT: call void @llvm.coro.opt.blocker( +// Check call void @_ZN23awaitableWithGetAwaiter13await_suspendIN18AwaitTransformTask12promise_typeEEEvSt16coroutine_handleIT_E{{.*}}#[[NORMAL_ATTR]] + +// Final suspend +// CHECK: call token @llvm.coro.save +// CHECK-NOT: call void @llvm.coro.opt.blocker( +// CHECK: call ptr @_ZN18AwaitTransformTask12promise_type12FinalAwaiter13await_suspendIS0_EESt16coroutine_handleIvES3_IT_E{{.*}}#[[NORMAL_ATTR]] + +// CHECK-NOT: attributes #[[NORMAL_ATTR]] = noinline +// CHECK: attributes #[[NOINLINE_ATTR]] = {{.*}}noinline diff --git a/clang/test/CodeGenCoroutines/coro-halo.cpp b/clang/test/CodeGenCoroutines/coro-halo.cpp index 6244f130b7be2..e75bedaf81fa2 100644 --- a/clang/test/CodeGenCoroutines/coro-halo.cpp +++ b/clang/test/CodeGenCoroutines/coro-halo.cpp @@ -1,5 +1,7 @@ // This tests that the coroutine heap allocation elision optimization could happen succesfully. 
// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 -O2 -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 -O2 -emit-llvm %s \ +// RUN: -fcxx-exceptions -fexceptions -o - | FileCheck %s #include "Inputs/coroutine.h" #include "Inputs/numeric.h" diff --git a/clang/test/CodeGenCoroutines/pr56301.cpp b/clang/test/CodeGenCoroutines/pr56301.cpp new file mode 100644 index 0000000000000..cd851c0b815db --- /dev/null +++ b/clang/test/CodeGenCoroutines/pr56301.cpp @@ -0,0 +1,85 @@ +// An end-to-end test to make sure things get processed correctly. +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s -O3 | \ +// RUN: FileCheck %s + +#include "Inputs/coroutine.h" + +struct SomeAwaitable { + // Resume the supplied handle once the awaitable becomes ready, + // returning a handle that should be resumed now for the sake of symmetric transfer. + // If the awaitable is already ready, return an empty handle without doing anything. + // + // Defined in another translation unit. Note that this may contain + // code that synchronizes with another thread. + std::coroutine_handle<> Register(std::coroutine_handle<>); +}; + +// Defined in another translation unit. +void DidntSuspend(); + +struct Awaiter { + SomeAwaitable&& awaitable; + bool suspended; + + bool await_ready() { return false; } + + std::coroutine_handle<> await_suspend(const std::coroutine_handle<> h) { + // Assume we will suspend unless proven otherwise below. We must do + // this *before* calling Register, since we may be destroyed by another + // thread asynchronously as soon as we have registered. + suspended = true; + + // Attempt to hand off responsibility for resuming/destroying the coroutine. + const auto to_resume = awaitable.Register(h); + + if (!to_resume) { + // The awaitable is already ready. In this case we know that Register didn't + // hand off responsibility for the coroutine. 
So record the fact that we didn't + // actually suspend, and tell the compiler to resume us inline. + suspended = false; + return h; + } + + // Resume whatever Register wants us to resume. + return to_resume; + } + + void await_resume() { + // If we didn't suspend, make note of that fact. + if (!suspended) { + DidntSuspend(); + } + } +}; + +struct MyTask{ + struct promise_type { + MyTask get_return_object() { return {}; } + std::suspend_never initial_suspend() { return {}; } + std::suspend_always final_suspend() noexcept { return {}; } + void unhandled_exception(); + + Awaiter await_transform(SomeAwaitable&& awaitable) { + return Awaiter{static_cast(awaitable)}; + } + }; +}; + +MyTask FooBar() { + co_await SomeAwaitable(); +} + +// CHECK-LABEL: @_Z6FooBarv +// CHECK: %[[to_resume:.*]] = {{.*}}call ptr @_ZN13SomeAwaitable8RegisterESt16coroutine_handleIvE +// CHECK-NEXT: %[[to_bool:.*]] = icmp eq ptr %[[to_resume]], null +// CHECK-NEXT: br i1 %[[to_bool]], label %[[then:.*]], label %[[else:.*]] + +// CHECK: [[then]]: +// We only access the coroutine frame conditionally as the sources did. +// CHECK: store i8 0, +// CHECK-NEXT: br label %[[else]] + +// CHECK: [[else]]: +// No more access to the coroutine frame until suspended. +// CHECK-NOT: store +// CHECK: } diff --git a/clang/test/CodeGenCoroutines/pr59723.cpp b/clang/test/CodeGenCoroutines/pr59723.cpp new file mode 100644 index 0000000000000..7fc9995f417ac --- /dev/null +++ b/clang/test/CodeGenCoroutines/pr59723.cpp @@ -0,0 +1,237 @@ +// This is reduced test case from https://github.com/llvm/llvm-project/issues/59723. +// This is not a minimal reproducer intentionally to check the compiler's ability. 
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 -fcxx-exceptions\ +// RUN: -fexceptions -O2 -emit-llvm %s -o - | FileCheck %s + +#include "Inputs/coroutine.h" + +// executor and operation base + +class bug_any_executor; + +struct bug_async_op_base +{ + void invoke(); + +protected: + + ~bug_async_op_base() = default; +}; + +class bug_any_executor +{ + using op_type = bug_async_op_base; + +public: + + virtual ~bug_any_executor() = default; + + // removing noexcept enables clang to find that the pointer has escaped + virtual void post(op_type& op) noexcept = 0; + + virtual void wait() noexcept = 0; +}; + +class bug_thread_executor : public bug_any_executor +{ + +public: + + void start() + { + + } + + ~bug_thread_executor() + { + } + + // although this implementation is not realy noexcept due to allocation but I have a real one that is and required to be noexcept + virtual void post(bug_async_op_base& op) noexcept override; + + virtual void wait() noexcept override + { + + } +}; + +// task and promise + +struct bug_final_suspend_notification +{ + virtual std::coroutine_handle<> get_waiter() = 0; +}; + +class bug_task; + +class bug_task_promise +{ + friend bug_task; +public: + + bug_task get_return_object() noexcept; + + constexpr std::suspend_always initial_suspend() noexcept { return {}; } + + std::suspend_always final_suspend() noexcept + { + return {}; + } + + void unhandled_exception() noexcept; + + constexpr void return_void() const noexcept {} + + void get_result() const + { + + } +}; + +template +T exchange(T &&t, U &&u) { + T ret = t; + t = u; + return ret; +} + +class bug_task +{ + friend bug_task_promise; + using handle = std::coroutine_handle<>; + using promise_t = bug_task_promise; + + bug_task(handle coro, promise_t* p) noexcept : this_coro{ coro }, this_promise{ p } + { + + } + +public: + using promise_type = bug_task_promise; + + bug_task(bug_task&& other) noexcept + : this_coro{ exchange(other.this_coro, nullptr) }, this_promise{ 
exchange(other.this_promise, nullptr) } { + + } + + ~bug_task() + { + if (this_coro) + this_coro.destroy(); + } + + constexpr bool await_ready() const noexcept + { + return false; + } + + handle await_suspend(handle waiter) noexcept + { + return this_coro; + } + + void await_resume() + { + return this_promise->get_result(); + } + + handle this_coro; + promise_t* this_promise; +}; + +bug_task bug_task_promise::get_return_object() noexcept +{ + return { std::coroutine_handle::from_promise(*this), this }; +} + +// spawn operation and spawner + +template +class bug_spawn_op final : public bug_async_op_base, bug_final_suspend_notification +{ + Handler handler; + bug_task task_; + +public: + + bug_spawn_op(Handler handler, bug_task&& t) + : handler { handler }, task_{ static_cast(t) } {} + + virtual std::coroutine_handle<> get_waiter() override + { + handler(); + return std::noop_coroutine(); + } +}; + +class bug_spawner; + +struct bug_spawner_awaiter +{ + bug_spawner& s; + std::coroutine_handle<> waiter; + + bug_spawner_awaiter(bug_spawner& s) : s{ s } {} + + bool await_ready() const noexcept; + + void await_suspend(std::coroutine_handle<> coro); + + void await_resume() {} +}; + +class bug_spawner +{ + friend bug_spawner_awaiter; + + struct final_handler_t + { + bug_spawner& s; + + void operator()() + { + s.awaiter_->waiter.resume(); + } + }; + +public: + + bug_spawner(bug_any_executor& ex) : ex_{ ex } {} + + void spawn(bug_task&& t) { + using op_t = bug_spawn_op; + // move task into ptr + op_t* ptr = new op_t(final_handler_t{ *this }, static_cast(t)); + ++count_; + ex_.post(*ptr); // ptr escapes here thus task escapes but clang can't deduce that unless post() is not noexcept + } + + bug_spawner_awaiter wait() noexcept { return { *this }; } + +private: + bug_any_executor& ex_; // if bug_thread_executor& is used instead enables clang to detect the escape of the promise + bug_spawner_awaiter* awaiter_ = nullptr; + unsigned count_ = 0; +}; + +// test case + +bug_task 
bug_spawned_task(int id, int inc) +{ + co_return; +} + +struct A { + A(); +}; + +void throwing_fn(bug_spawner& s) { + s.spawn(bug_spawned_task(1, 2)); + throw A{}; +} + +// Check that the coroutine frame of bug_spawned_task is allocated from operator new. +// CHECK: define{{.*}}@_Z11throwing_fnR11bug_spawner +// CHECK-NOT: alloc +// CHECK: %[[CALL:.+]] = {{.*}}@_Znwm(i64{{.*}} 24) +// CHECK: store ptr @_Z16bug_spawned_taskii.resume, ptr %[[CALL]] diff --git a/clang/test/CodeGenObjCXX/crash-function-type.mm b/clang/test/CodeGenObjCXX/crash-function-type.mm index 53acc58dfc44d..280497a3258a4 100644 --- a/clang/test/CodeGenObjCXX/crash-function-type.mm +++ b/clang/test/CodeGenObjCXX/crash-function-type.mm @@ -1,3 +1,6 @@ +// Mark test as unsupported on PS5 because PS5 doesn't support the function sanitizer. +// UNSUPPORTED: target=x86_64-sie-ps5 + // RUN: %clang_cc1 -fblocks -fsanitize=function -emit-llvm %s -o %t void g(void (^)()); diff --git a/clang/test/Driver/Inputs/solaris_sparc_tree/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/crtbeginS.o b/clang/test/Driver/Inputs/solaris_sparc_tree/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/crtbeginS.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/solaris_sparc_tree/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/sparcv9/crtbeginS.o b/clang/test/Driver/Inputs/solaris_sparc_tree/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/sparcv9/crtbeginS.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/fsanitize.c b/clang/test/Driver/fsanitize.c index 182de9f486444..9442f6b91471f 100644 --- a/clang/test/Driver/fsanitize.c +++ b/clang/test/Driver/fsanitize.c @@ -971,3 +971,17 @@ // RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined,function -mcmodel=large %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UBSAN-FUNCTION-CODE-MODEL // CHECK-UBSAN-FUNCTION-CODE-MODEL: error: invalid argument '-fsanitize=function' only allowed with 
'-mcmodel=small' + +// RUN: %clang --target=x86_64-sie-ps5 -fsanitize=function %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UBSAN-FUNCTION +// RUN: %clang --target=x86_64-sie-ps5 -fsanitize=undefined -fsanitize=function %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UBSAN-FUNCTION +// RUN: %clang --target=x86_64-sie-ps5 -fsanitize=kcfi %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UBSAN-KCFI +// RUN: %clang --target=x86_64-sie-ps5 -fsanitize=function -fsanitize=kcfi %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UBSAN-KCFI --check-prefix=CHECK-UBSAN-FUNCTION +// RUN: %clang --target=x86_64-sie-ps5 -fsanitize=undefined %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UBSAN-UNDEFINED + +// RUN: %clang --target=armv6t2-eabi -mexecute-only -fsanitize=function %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UBSAN-FUNCTION +// RUN: %clang --target=armv6t2-eabi -mexecute-only -fsanitize=kcfi %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UBSAN-KCFI +// RUN: %clang --target=armv6t2-eabi -mexecute-only -fsanitize=undefined %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UBSAN-UNDEFINED + +// CHECK-UBSAN-KCFI-DAG: error: invalid argument '-fsanitize=kcfi' not allowed with {{('x86_64-sie-ps5'|'armv6t2-unknown-unknown-eabi')}} +// CHECK-UBSAN-FUNCTION-DAG: error: invalid argument '-fsanitize=function' not allowed with {{('x86_64-sie-ps5'|'armv6t2-unknown-unknown-eabi')}} +// CHECK-UBSAN-UNDEFINED: "-fsanitize={{((alignment|array-bounds|bool|builtin|enum|float-cast-overflow|integer-divide-by-zero|nonnull-attribute|null|pointer-overflow|return|returns-nonnull-attribute|shift-base|shift-exponent|signed-integer-overflow|unreachable|vla-bound),?){17}"}} diff --git a/clang/test/Driver/solaris-ld.c b/clang/test/Driver/solaris-ld.c index 2127ad5ded074..8d97a5a3695bd 100644 --- a/clang/test/Driver/solaris-ld.c +++ b/clang/test/Driver/solaris-ld.c @@ -106,6 +106,33 @@ // CHECK-SPARC32-SHARED-NOT: "-lgcc" // CHECK-SPARC32-SHARED-NOT: "-lm" +// Check the right ld flags are 
present with -pie. +// RUN: %clang --target=sparc-sun-solaris2.11 -### %s -pie \ +// RUN: --gcc-toolchain="" \ +// RUN: --sysroot=%S/Inputs/solaris_sparc_tree 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-PIE %s +// RUN: %clang --target=sparc-sun-solaris2.11 -### %s -nopie \ +// RUN: --gcc-toolchain="" \ +// RUN: --sysroot=%S/Inputs/solaris_sparc_tree 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-NOPIE %s + +// Check that -shared/-r/-static disable PIE. +// RUN: %clang --target=sparc-sun-solaris2.11 -### %s -shared -pie \ +// RUN: --gcc-toolchain="" \ +// RUN: --sysroot=%S/Inputs/solaris_sparc_tree 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-NOPIE %s +// RUN: %clang --target=sparc-sun-solaris2.11 -### %s -r -pie \ +// RUN: --gcc-toolchain="" \ +// RUN: --sysroot=%S/Inputs/solaris_sparc_tree 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-NOPIE %s +// RUN: %clang --target=sparc-sun-solaris2.11 -### %s -static -pie \ +// RUN: --gcc-toolchain="" \ +// RUN: --sysroot=%S/Inputs/solaris_sparc_tree 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-NOPIE %s + +// CHECK-PIE: "-z" "type=pie" +// CHECK-NOPIE-NOT: "-z" "type=pie" + // -r suppresses default -l and crt*.o, values-*.o like -nostdlib. // RUN: %clang -### %s --target=sparc-sun-solaris2.11 -r 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-RELOCATABLE @@ -115,6 +142,28 @@ // CHECK-RELOCATABLE-NOT: /crt{{[^.]+}}.o // CHECK-RELOCATABLE-NOT: /values-{{[^.]+}}.o +// Check that crt{begin,end}S.o is linked with -shared/-pie. 
+// RUN: %clang --target=sparc-sun-solaris2.11 -### %s \ +// RUN: --gcc-toolchain="" \ +// RUN: --sysroot=%S/Inputs/solaris_sparc_tree 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-NOCRTS %s +// RUN: %clang --target=sparc-sun-solaris2.11 -### %s -shared \ +// RUN: --gcc-toolchain="" \ +// RUN: --sysroot=%S/Inputs/solaris_sparc_tree 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-CRTS %s +// RUN: %clang --target=sparc-sun-solaris2.11 -### %s -nopie \ +// RUN: --gcc-toolchain="" \ +// RUN: --sysroot=%S/Inputs/solaris_sparc_tree 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-NOCRTS %s +// RUN: %clang --target=sparc-sun-solaris2.11 -### %s -pie \ +// RUN: --gcc-toolchain="" \ +// RUN: --sysroot=%S/Inputs/solaris_sparc_tree 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-CRTS %s +// CHECK-CRTS: crtbeginS.o +// CHECK-CRTS: crtendS.o +// CHECK-NOCRTS-NOT: crtbeginS.o +// CHECK-NOCRTS-NOT: crtendS.o + // Check that crtfastmath.o is linked with -ffast-math. // Check sparc-sun-solaris2.11, 32bit diff --git a/clang/test/Driver/target-specific.s b/clang/test/Driver/target-specific.s new file mode 100644 index 0000000000000..aa4fc73812099 --- /dev/null +++ b/clang/test/Driver/target-specific.s @@ -0,0 +1,12 @@ +/// Check that we report a warning instead of an error for target-specific compilation only options. +// RUN: %clang -### --target=aarch64 -faddrsig -mbranch-protection=standard -c %s 2>&1 | FileCheck %s +// RUN: %clang -### --target=aarch64 -faddrsig -mbranch-protection=standard -c -fno-integrated-as %s 2>&1 | FileCheck %s + +/// Report a warning if we perform the link phase. +// RUN: %clang -### --target=aarch64 -faddrsig -mbranch-protection=standard %s 2>&1 | FileCheck %s + +// CHECK: warning: argument unused during compilation: '-faddrsig' +// CHECK: warning: argument unused during compilation: '-mbranch-protection=standard' + +/// assembler-with-cpp claims compile only options. Ideally we should emit a warning. 
+// RUN: %clang -### -Werror --target=aarch64 -c -faddrsig -mbranch-protection=standard -x assembler-with-cpp %s diff --git a/clang/test/Driver/x86-mfpmath.c b/clang/test/Driver/x86-mfpmath.c new file mode 100644 index 0000000000000..8f85cced953ab --- /dev/null +++ b/clang/test/Driver/x86-mfpmath.c @@ -0,0 +1,5 @@ +// RUN: %clang -### -c --target=x86_64 -mfpmath=sse %s 2>&1 | FileCheck %s +// CHECK: "-mfpmath" "sse" + +// RUN: %clang -### -c --target=x86_64 -mfpmath=sse -x assembler %s 2>&1 | FileCheck %s --check-prefix=WARN +// WARN: warning: argument unused during compilation: '-mfpmath=sse' diff --git a/clang/test/Driver/x86-no-gather-no-scatter.cpp b/clang/test/Driver/x86-no-gather-no-scatter.cpp index 7efcc55787c42..63611227bd583 100644 --- a/clang/test/Driver/x86-no-gather-no-scatter.cpp +++ b/clang/test/Driver/x86-no-gather-no-scatter.cpp @@ -1,8 +1,8 @@ /// Tests -mno-gather and -mno-scatter -// RUN: %clang -c -mno-gather -### %s 2>&1 | FileCheck --check-prefix=NOGATHER %s -// RUN: %clang_cl -c /Qgather- -### %s 2>&1 | FileCheck --check-prefix=NOGATHER %s +// RUN: %clang -target x86_64-unknown-linux-gnu -c -mno-gather -### %s 2>&1 | FileCheck --check-prefix=NOGATHER %s +// RUN: %clang_cl --target=x86_64-windows -c /Qgather- -### -- %s 2>&1 | FileCheck --check-prefix=NOGATHER %s // NOGATHER: "-target-feature" "+prefer-no-gather" -// RUN: %clang -c -mno-scatter -### %s 2>&1 | FileCheck --check-prefix=NOSCATTER %s -// RUN: %clang_cl -c /Qscatter- -### %s 2>&1 | FileCheck --check-prefix=NOSCATTER %s +// RUN: %clang -target x86_64-unknown-linux-gnu -c -mno-scatter -### %s 2>&1 | FileCheck --check-prefix=NOSCATTER %s +// RUN: %clang_cl --target=x86_64-windows -c /Qscatter- -### -- %s 2>&1 | FileCheck --check-prefix=NOSCATTER %s // NOSCATTER: "-target-feature" "+prefer-no-scatter" diff --git a/clang/test/Misc/constexpr-subobj-init-source-ranges.cpp b/clang/test/Misc/constexpr-subobj-init-source-ranges.cpp new file mode 100644 index 0000000000000..342da2d886668 --- 
/dev/null +++ b/clang/test/Misc/constexpr-subobj-init-source-ranges.cpp @@ -0,0 +1,11 @@ +// RUN: not %clang_cc1 -fsyntax-only -fdiagnostics-print-source-range-info %s 2>&1 | FileCheck %s --strict-whitespace + +struct DelBase { + constexpr DelBase() = delete; +}; + +// CHECK: :{[[@LINE+1]]:21-[[@LINE+1]]:28} +struct Foo : public DelBase { + constexpr Foo() {}; +}; +constexpr Foo f; diff --git a/clang/test/Sema/empty-init.c b/clang/test/Sema/empty-init.c new file mode 100644 index 0000000000000..8cb4a77710c2b --- /dev/null +++ b/clang/test/Sema/empty-init.c @@ -0,0 +1,30 @@ +// RUN: %clang_cc1 %s -std=c2x -Wall -pedantic -fsyntax-only -verify=good +// RUN: %clang_cc1 %s -std=c2x -Wpre-c2x-compat -fsyntax-only -verify=c2x +// RUN: %clang_cc1 %s -std=c2x -Wpre-c2x-compat -Wno-gnu-empty-initializer -fsyntax-only -verify=c2x +// RUN: %clang_cc1 %s -std=c2x -Wgnu-empty-initializer -fsyntax-only -verify=good +// RUN: %clang_cc1 %s -std=c17 -Wall -pedantic -fsyntax-only -verify=c2x-ext +// RUN: %clang_cc1 %s -std=c17 -Wgnu-empty-initializer -fsyntax-only -verify=good +// RUN: %clang_cc1 %s -std=c17 -Wc2x-extensions -fsyntax-only -verify=c2x-ext +// RUN: %clang_cc1 %s -std=c17 -Wpre-c2x-compat -fsyntax-only -verify=good + +// good-no-diagnostics + +// Empty brace initialization used to be a GNU extension, but the feature was +// added to C2x. We now treat empty initialization as a C extension rather than +// a GNU extension. Thus, -Wgnu-empty-initializer is always silently ignored. 
+ +struct S { + int a; +}; + +struct S s = {}; /* c2x-warning {{use of an empty initializer is incompatible with C standards before C2x}} + c2x-ext-warning {{use of an empty initializer is a C2x extension}} + */ + +void func(void) { + struct S s2 = {}; /* c2x-warning {{use of an empty initializer is incompatible with C standards before C2x}} + c2x-ext-warning {{use of an empty initializer is a C2x extension}} + */ + (void)s2; +} + diff --git a/clang/test/Sema/riscv-rvv-lax-vector-conversions.c b/clang/test/Sema/riscv-rvv-lax-vector-conversions.c index ff3e028aa314d..8ab01620b82aa 100644 --- a/clang/test/Sema/riscv-rvv-lax-vector-conversions.c +++ b/clang/test/Sema/riscv-rvv-lax-vector-conversions.c @@ -2,8 +2,6 @@ // RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=8 -mvscale-max=8 -flax-vector-conversions=integer -ffreestanding -fsyntax-only -verify=lax-vector-integer %s // RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=8 -mvscale-max=8 -flax-vector-conversions=all -ffreestanding -fsyntax-only -verify=lax-vector-all %s -// lax-vector-all-no-diagnostics - // REQUIRES: riscv-registered-target #define RVV_FIXED_ATTR __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))) @@ -20,6 +18,8 @@ typedef __rvv_uint64m1_t vuint64m1_t; typedef __rvv_float32m1_t vfloat32m1_t; typedef __rvv_float64m1_t vfloat64m1_t; +typedef __rvv_int64m2_t vint64m2_t; + typedef vfloat32m1_t rvv_fixed_float32m1_t RVV_FIXED_ATTR; typedef vint32m1_t rvv_fixed_int32m1_t RVV_FIXED_ATTR; typedef float gnu_fixed_float32m1_t GNU_FIXED_ATTR; @@ -76,3 +76,17 @@ void gnu_allowed_with_all_lax_conversions() { // lax-vector-none-error@-1 {{assigning to 'vfloat64m1_t' (aka '__rvv_float64m1_t') from incompatible type}} // lax-vector-integer-error@-2 {{assigning to 'vfloat64m1_t' (aka '__rvv_float64m1_t') from incompatible type}} } + +void not_allowed() { + 
rvv_fixed_int32m1_t fi32m1; + vint64m2_t si64m2; + + fi32m1 = si64m2; + // lax-vector-none-error@-1 {{assigning to 'rvv_fixed_int32m1_t' (vector of 16 'int' values) from incompatible type}} + // lax-vector-integer-error@-2 {{assigning to 'rvv_fixed_int32m1_t' (vector of 16 'int' values) from incompatible type}} + // lax-vector-all-error@-3 {{assigning to 'rvv_fixed_int32m1_t' (vector of 16 'int' values) from incompatible type}} + si64m2 = fi32m1; + // lax-vector-none-error@-1 {{assigning to 'vint64m2_t' (aka '__rvv_int64m2_t') from incompatible type}} + // lax-vector-integer-error@-2 {{assigning to 'vint64m2_t' (aka '__rvv_int64m2_t') from incompatible type}} + // lax-vector-all-error@-3 {{assigning to 'vint64m2_t' (aka '__rvv_int64m2_t') from incompatible type}} +} diff --git a/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp b/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp new file mode 100644 index 0000000000000..00a39f9f03b79 --- /dev/null +++ b/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp @@ -0,0 +1,37 @@ +// RUN: %clang_cc1 -fsyntax-only -std=c++20 -verify %s + +template class normal_iterator {}; + +template struct is_convertible {}; + +template +inline constexpr bool is_convertible_v = is_convertible::value; // expected-error {{no member named 'value' in 'is_convertible'}} + +template +concept convertible_to = is_convertible_v; // #1 + +template + requires requires(IteratorL lhs, IteratorR rhs) { // #2 + { lhs == rhs } -> convertible_to; // #3 + } +constexpr bool compare(normal_iterator lhs, normal_iterator rhs) { // #4 + return false; +} + +class Object; + +void function() { + normal_iterator begin, end; + compare(begin, end); // expected-error {{no matching function for call to 'compare'}} #5 +} + +// expected-note@#1 {{in instantiation of variable template specialization 'is_convertible_v' requested here}} +// expected-note@#1 {{substituting template arguments into constraint expression here}} +// expected-note@#3 {{checking the satisfaction 
of concept 'convertible_to'}} +// expected-note@#2 {{substituting template arguments into constraint expression here}} +// expected-note@#5 {{checking constraint satisfaction for template 'compare'}} +// expected-note@#5 {{in instantiation of function template specialization 'compare' requested here}} + +// expected-note@#4 {{candidate template ignored: constraints not satisfied [with IteratorL = Object *, IteratorR = Object *]}} +// We don't know exactly the substituted type for `lhs == rhs`, thus a placeholder 'expr-type' is emitted. +// expected-note@#3 {{because 'convertible_to' would be invalid}} diff --git a/clang/test/SemaCXX/concept-fatal-error.cpp b/clang/test/SemaCXX/concept-fatal-error.cpp index c299b39fdeb23..c606b9e21a364 100644 --- a/clang/test/SemaCXX/concept-fatal-error.cpp +++ b/clang/test/SemaCXX/concept-fatal-error.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fsyntax-only -std=c++20 -ferror-limit 1 -verify %s +// RUN: %clang_cc1 -fsyntax-only -std=c++20 -ferror-limit 1 -verify %s template concept f = requires { 42; }; @@ -6,5 +6,5 @@ struct h { // The missing semicolon will trigger an error and -ferror-limit=1 will make it fatal // We test that we do not crash in such cases (#55401) int i = requires { { i } f } // expected-error {{expected ';' at end of declaration list}} - // expected-error@* {{too many errros emitted}} + // expected-error@* {{too many errors emitted}} }; diff --git a/clang/test/SemaCXX/constexpr-subobj-initialization.cpp b/clang/test/SemaCXX/constexpr-subobj-initialization.cpp new file mode 100644 index 0000000000000..cd096a9270937 --- /dev/null +++ b/clang/test/SemaCXX/constexpr-subobj-initialization.cpp @@ -0,0 +1,58 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s + +namespace baseclass_uninit { +struct DelBase { + constexpr DelBase() = delete; // expected-note {{'DelBase' has been explicitly marked deleted here}} +}; + +struct Foo : DelBase { // expected-note 2{{constructor of base class 'DelBase' is not called}} + constexpr 
Foo() {}; // expected-error {{call to deleted constructor of 'DelBase'}} +}; +constexpr Foo f; // expected-error {{must be initialized by a constant expression}} +struct Bar : Foo { + constexpr Bar() {}; +}; +constexpr Bar bar; // expected-error {{must be initialized by a constant expression}} + +struct Base {}; +struct A : Base { // expected-note {{constructor of base class 'Base' is not called}} + constexpr A() : value() {} // expected-error {{member initializer 'value' does not name a non-static data member or base class}} +}; + +constexpr A a; // expected-error {{must be initialized by a constant expression}} + +struct B : Base { // expected-note {{constructor of base class 'Base' is not called}} + constexpr B() : {} // expected-error {{expected class member or base class name}} +}; + +constexpr B b; // expected-error {{must be initialized by a constant expression}} +} // namespace baseclass_uninit + + +struct Foo { + constexpr Foo(); // expected-note 2{{declared here}} +}; + +constexpr Foo ff; // expected-error {{must be initialized by a constant expression}} \ + // expected-note {{undefined constructor 'Foo' cannot be used in a constant expression}} + +struct Bar : protected Foo { + int i; + constexpr Bar() : i(12) {} // expected-note {{undefined constructor 'Foo' cannot be used in a constant expression}} +}; + +constexpr Bar bb; // expected-error {{must be initialized by a constant expression}} \ + // expected-note {{in call to 'Bar()'}} + +template +struct Baz { + constexpr Baz(); // expected-note {{declared here}} +}; + +struct Quux : Baz, private Bar { + int i; + constexpr Quux() : i(12) {} // expected-note {{undefined constructor 'Baz' cannot be used in a constant expression}} +}; + +constexpr Quux qx; // expected-error {{must be initialized by a constant expression}} \ + // expected-note {{in call to 'Quux()'}} diff --git a/clang/test/SemaCXX/template-64605.cpp b/clang/test/SemaCXX/template-64605.cpp new file mode 100644 index 
0000000000000..9d7f8d4100171 --- /dev/null +++ b/clang/test/SemaCXX/template-64605.cpp @@ -0,0 +1,23 @@ +// RUN: %clang_cc1 -triple x86_64-linux-gnu -ast-dump -ast-dump-filter=b_64605 %s | FileCheck %s + +// https://github.com/llvm/llvm-project/issues/64605 + +#pragma STDC FENV_ACCESS ON +template +int b_64605() { + int x; + if ((float)0xFFFFFFFF != (float)0x100000000) { + x = 1; + } + return x; +} +int f() { return b_64605(); } + +// CHECK: ImplicitCastExpr {{.*}} 'float' RoundingMath=1 AllowFEnvAccess=1 +// CHECK-NEXT: IntegerLiteral {{.*}} 4294967295 + +// CHECK: FunctionDecl {{.*}} b_64605 'int ()' +// CHECK-NEXT: TemplateArgument type 'void' + +// CHECK: ImplicitCastExpr {{.*}} 'float' RoundingMath=1 AllowFEnvAccess=1 +// CHECK-NEXT: IntegerLiteral {{.*}} 4294967295 diff --git a/clang/tools/clang-fuzzer/CMakeLists.txt b/clang/tools/clang-fuzzer/CMakeLists.txt index e68ed8bbcb069..2b9720ee627cb 100644 --- a/clang/tools/clang-fuzzer/CMakeLists.txt +++ b/clang/tools/clang-fuzzer/CMakeLists.txt @@ -115,6 +115,9 @@ add_clang_executable(clang-fuzzer EXCLUDE_FROM_ALL ${DUMMY_MAIN} ClangFuzzer.cpp + + DEPENDS + ClangDriverOptions ) target_link_libraries(clang-fuzzer @@ -127,6 +130,9 @@ add_clang_executable(clang-objc-fuzzer EXCLUDE_FROM_ALL ${DUMMY_MAIN} ClangObjectiveCFuzzer.cpp + + DEPENDS + ClangDriverOptions ) target_link_libraries(clang-objc-fuzzer diff --git a/clang/tools/include-mapping/gen_std.py b/clang/tools/include-mapping/gen_std.py index 2390ff1f2cced..57a5a6772ba89 100755 --- a/clang/tools/include-mapping/gen_std.py +++ b/clang/tools/include-mapping/gen_std.py @@ -242,6 +242,11 @@ def main(): (symbol_index_root, "filesystem.html", "std::filesystem::"), (symbol_index_root, "pmr.html", "std::pmr::"), (symbol_index_root, "ranges.html", "std::ranges::"), + + (symbol_index_root, "views.html", "std::ranges::views::"), + # std::ranges::views can be accessed as std::views. 
+ (symbol_index_root, "views.html", "std::views::"), + (symbol_index_root, "regex_constants.html", "std::regex_constants::"), (symbol_index_root, "this_thread.html", "std::this_thread::"), # Zombie symbols that were available from the Standard Library, but are diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp index 3a1058f5e3fe9..9b81abda1d2e1 100644 --- a/clang/unittests/AST/ASTImporterTest.cpp +++ b/clang/unittests/AST/ASTImporterTest.cpp @@ -7711,6 +7711,47 @@ TEST_P(ASTImporterOptionSpecificTestBase, ImportConstructorUsingShadow) { CheckAST(ToTU, ToC); } +TEST_P(ASTImporterOptionSpecificTestBase, + ImportFunctionDeclBitShouldNotOverwriteCtorDeclBits) { + Decl *From, *To; + std::tie(From, To) = getImportedDecl( + R"s( + struct A { + A() : m() {} + int m; + }; + + A foo() { A a; return a; } + A bar() { return {}; } + )s", + Lang_CXX17, + R"s( + struct A { + A() : m() {} + int m; + }; + A baz() { return {}; } + )s", + Lang_CXX17, "A"); + + auto HasCtorInit = + hasAnyConstructorInitializer(cxxCtorInitializer(isMemberInitializer())); + auto ImpMoveCtor = + cxxConstructorDecl(isMoveConstructor(), isImplicit(), HasCtorInit); + + auto *FromImpMoveCtor = FirstDeclMatcher().match( + From, ImpMoveCtor); + auto *ToImpMoveCtor = FirstDeclMatcher().match( + To, ImpMoveCtor); + + EXPECT_TRUE(FromImpMoveCtor->getNumCtorInitializers() == 1); + EXPECT_FALSE(FromImpMoveCtor->FriendConstraintRefersToEnclosingTemplate()); + + EXPECT_TRUE(ToImpMoveCtor->getNumCtorInitializers() == 1); + EXPECT_FALSE(ToImpMoveCtor->FriendConstraintRefersToEnclosingTemplate()); + EXPECT_TRUE(*ToImpMoveCtor->init_begin()); +} + AST_MATCHER_P(UsingShadowDecl, hasIntroducerDecl, internal::Matcher, InnerMatcher) { return InnerMatcher.matches(*Node.getIntroducer(), Finder, Builder); diff --git a/clang/unittests/AST/DeclTest.cpp b/clang/unittests/AST/DeclTest.cpp index 2ed2ed750941c..d2977b0cb55b6 100644 --- a/clang/unittests/AST/DeclTest.cpp +++ 
b/clang/unittests/AST/DeclTest.cpp @@ -353,6 +353,32 @@ TEST(Decl, FriendFunctionWithinClassInHeaderUnit) { EXPECT_TRUE(getFooValue->isInlined()); } +TEST(Decl, FunctionDeclBitsShouldNotOverlapWithCXXConstructorDeclBits) { + llvm::Annotations Code(R"( + struct A { + A() : m() {} + int m; + }; + + A f() { return A(); } + )"); + + auto AST = tooling::buildASTFromCodeWithArgs(Code.code(), {"-std=c++14"}); + ASTContext &Ctx = AST->getASTContext(); + + auto HasCtorInit = + hasAnyConstructorInitializer(cxxCtorInitializer(isMemberInitializer())); + auto ImpMoveCtor = + cxxConstructorDecl(isMoveConstructor(), isImplicit(), HasCtorInit) + .bind("MoveCtor"); + + auto *ToImpMoveCtor = + selectFirst("MoveCtor", match(ImpMoveCtor, Ctx)); + + EXPECT_TRUE(ToImpMoveCtor->getNumCtorInitializers() == 1); + EXPECT_FALSE(ToImpMoveCtor->FriendConstraintRefersToEnclosingTemplate()); +} + TEST(Decl, NoProtoFunctionDeclAttributes) { llvm::Annotations Code(R"( void f(); diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index e512a861dc4e3..271778b5bb9e6 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -13458,6 +13458,8 @@ TEST_F(FormatTest, LayoutCxx11BraceInitializers) { verifyFormat( "class A {\n" " A() : a{} {}\n" + " A() : Base{} {}\n" + " A() : Base>{} {}\n" " A(int b) : b(b) {}\n" " A(int a, int b) : a(a), bs{{bs...}} { f(); }\n" " int a, b;\n" @@ -26266,6 +26268,7 @@ TEST_F(FormatTest, RemoveParentheses) { Style.RemoveParentheses = FormatStyle::RPS_MultipleParentheses; verifyFormat("int x __attribute__((aligned(16))) = 0;", Style); + verifyFormat("decltype((foo->bar)) baz;", Style); verifyFormat("class __declspec(dllimport) X {};", "class __declspec((dllimport)) X {};", Style); verifyFormat("int x = (({ 0; }));", "int x = ((({ 0; })));", Style); diff --git a/clang/unittests/Interpreter/InterpreterTest.cpp b/clang/unittests/Interpreter/InterpreterTest.cpp index 338003cd9851c..abb8e6377aabd 
100644 --- a/clang/unittests/Interpreter/InterpreterTest.cpp +++ b/clang/unittests/Interpreter/InterpreterTest.cpp @@ -232,10 +232,20 @@ TEST(IncrementalProcessing, FindMangledNameSymbol) { } std::string MangledName = MangleName(FD); - auto Addr = cantFail(Interp->getSymbolAddress(MangledName)); - EXPECT_NE(0U, Addr.getValue()); + auto Addr = Interp->getSymbolAddress(MangledName); + EXPECT_FALSE(!Addr); + EXPECT_NE(0U, Addr->getValue()); GlobalDecl GD(FD); - EXPECT_EQ(Addr, cantFail(Interp->getSymbolAddress(GD))); + EXPECT_EQ(*Addr, cantFail(Interp->getSymbolAddress(GD))); + cantFail( + Interp->ParseAndExecute("extern \"C\" int printf(const char*,...);")); + Addr = Interp->getSymbolAddress("printf"); + EXPECT_FALSE(!Addr); + + // FIXME: Re-enable when we investigate the way we handle dllimports on Win. +#ifndef _WIN32 + EXPECT_EQ((unsigned long long)&printf, Addr->getValue()); +#endif // _WIN32 } static void *AllocateObject(TypeDecl *TD, Interpreter &Interp) { diff --git a/compiler-rt/lib/asan/asan_interceptors.cpp b/compiler-rt/lib/asan/asan_interceptors.cpp index b9b82564b3303..5158e99b75e5d 100644 --- a/compiler-rt/lib/asan/asan_interceptors.cpp +++ b/compiler-rt/lib/asan/asan_interceptors.cpp @@ -588,19 +588,34 @@ INTERCEPTOR(char*, strncpy, char *to, const char *from, uptr size) { return REAL(strncpy)(to, from, size); } -INTERCEPTOR(long, strtol, const char *nptr, char **endptr, int base) { - void *ctx; - ASAN_INTERCEPTOR_ENTER(ctx, strtol); - ENSURE_ASAN_INITED(); - if (!flags()->replace_str) { - return REAL(strtol)(nptr, endptr, base); - } +template +static ALWAYS_INLINE auto StrtolImpl(void *ctx, Fn real, const char *nptr, + char **endptr, int base) + -> decltype(real(nullptr, nullptr, 0)) { + if (!flags()->replace_str) + return real(nptr, endptr, base); char *real_endptr; - long result = REAL(strtol)(nptr, &real_endptr, base); + auto res = real(nptr, &real_endptr, base); StrtolFixAndCheck(ctx, nptr, endptr, real_endptr, base); - return result; + return 
res; } +# define INTERCEPTOR_STRTO_BASE(ret_type, func) \ + INTERCEPTOR(ret_type, func, const char *nptr, char **endptr, int base) { \ + void *ctx; \ + ASAN_INTERCEPTOR_ENTER(ctx, func); \ + ENSURE_ASAN_INITED(); \ + return StrtolImpl(ctx, REAL(func), nptr, endptr, base); \ + } + +INTERCEPTOR_STRTO_BASE(long, strtol) +INTERCEPTOR_STRTO_BASE(long long, strtoll) + +# if SANITIZER_GLIBC +INTERCEPTOR_STRTO_BASE(long, __isoc23_strtol) +INTERCEPTOR_STRTO_BASE(long long, __isoc23_strtoll) +# endif + INTERCEPTOR(int, atoi, const char *nptr) { void *ctx; ASAN_INTERCEPTOR_ENTER(ctx, atoi); @@ -639,20 +654,6 @@ INTERCEPTOR(long, atol, const char *nptr) { return result; } -#if ASAN_INTERCEPT_ATOLL_AND_STRTOLL -INTERCEPTOR(long long, strtoll, const char *nptr, char **endptr, int base) { - void *ctx; - ASAN_INTERCEPTOR_ENTER(ctx, strtoll); - ENSURE_ASAN_INITED(); - if (!flags()->replace_str) { - return REAL(strtoll)(nptr, endptr, base); - } - char *real_endptr; - long long result = REAL(strtoll)(nptr, &real_endptr, base); - StrtolFixAndCheck(ctx, nptr, endptr, real_endptr, base); - return result; -} - INTERCEPTOR(long long, atoll, const char *nptr) { void *ctx; ASAN_INTERCEPTOR_ENTER(ctx, atoll); @@ -666,7 +667,6 @@ INTERCEPTOR(long long, atoll, const char *nptr) { ASAN_READ_STRING(ctx, nptr, (real_endptr - nptr) + 1); return result; } -#endif // ASAN_INTERCEPT_ATOLL_AND_STRTOLL #if ASAN_INTERCEPT___CXA_ATEXIT || ASAN_INTERCEPT_ATEXIT static void AtCxaAtexit(void *unused) { @@ -751,11 +751,13 @@ void InitializeAsanInterceptors() { ASAN_INTERCEPT_FUNC(atoi); ASAN_INTERCEPT_FUNC(atol); - ASAN_INTERCEPT_FUNC(strtol); -#if ASAN_INTERCEPT_ATOLL_AND_STRTOLL ASAN_INTERCEPT_FUNC(atoll); + ASAN_INTERCEPT_FUNC(strtol); ASAN_INTERCEPT_FUNC(strtoll); -#endif +# if SANITIZER_GLIBC + ASAN_INTERCEPT_FUNC(__isoc23_strtol); + ASAN_INTERCEPT_FUNC(__isoc23_strtoll); +# endif // Intecept jump-related functions. 
ASAN_INTERCEPT_FUNC(longjmp); diff --git a/compiler-rt/lib/asan/asan_interceptors.h b/compiler-rt/lib/asan/asan_interceptors.h index 268096fea5e7e..d00d05587b368 100644 --- a/compiler-rt/lib/asan/asan_interceptors.h +++ b/compiler-rt/lib/asan/asan_interceptors.h @@ -42,12 +42,10 @@ void InitializePlatformInterceptors(); // Use macro to describe if specific function should be // intercepted on a given platform. #if !SANITIZER_WINDOWS -# define ASAN_INTERCEPT_ATOLL_AND_STRTOLL 1 # define ASAN_INTERCEPT__LONGJMP 1 # define ASAN_INTERCEPT_INDEX 1 # define ASAN_INTERCEPT_PTHREAD_CREATE 1 #else -# define ASAN_INTERCEPT_ATOLL_AND_STRTOLL 0 # define ASAN_INTERCEPT__LONGJMP 0 # define ASAN_INTERCEPT_INDEX 0 # define ASAN_INTERCEPT_PTHREAD_CREATE 0 diff --git a/compiler-rt/lib/asan/asan_win_dll_thunk.cpp b/compiler-rt/lib/asan/asan_win_dll_thunk.cpp index e3a90f18ed81a..0fa636bec0d00 100644 --- a/compiler-rt/lib/asan/asan_win_dll_thunk.cpp +++ b/compiler-rt/lib/asan/asan_win_dll_thunk.cpp @@ -65,6 +65,7 @@ INTERCEPT_WRAP_W_W(_expand_dbg) INTERCEPT_LIBRARY_FUNCTION(atoi); INTERCEPT_LIBRARY_FUNCTION(atol); +INTERCEPT_LIBRARY_FUNCTION(atoll); INTERCEPT_LIBRARY_FUNCTION(frexp); INTERCEPT_LIBRARY_FUNCTION(longjmp); #if SANITIZER_INTERCEPT_MEMCHR @@ -91,6 +92,7 @@ INTERCEPT_LIBRARY_FUNCTION(strspn); INTERCEPT_LIBRARY_FUNCTION(strstr); INTERCEPT_LIBRARY_FUNCTION(strtok); INTERCEPT_LIBRARY_FUNCTION(strtol); +INTERCEPT_LIBRARY_FUNCTION(strtoll); INTERCEPT_LIBRARY_FUNCTION(wcslen); INTERCEPT_LIBRARY_FUNCTION(wcsnlen); diff --git a/compiler-rt/lib/interception/interception.h b/compiler-rt/lib/interception/interception.h index 078d33b61be31..069f73d276f3c 100644 --- a/compiler-rt/lib/interception/interception.h +++ b/compiler-rt/lib/interception/interception.h @@ -181,7 +181,7 @@ const interpose_substitution substitution_##func_name[] \ // FreeBSD's dynamic linker (incompliantly) gives non-weak symbols higher // priority than weak ones so weak aliases won't work for indirect calls // in 
position-independent (-fPIC / -fPIE) mode. -# define __ASM_WEAK_WRAPPER(func) +# define __ASM_WEAK_WRAPPER(func) ".globl " #func "\n" # else # define __ASM_WEAK_WRAPPER(func) ".weak " #func "\n" # endif // SANITIZER_FREEBSD || SANITIZER_NETBSD diff --git a/compiler-rt/lib/msan/msan_interceptors.cpp b/compiler-rt/lib/msan/msan_interceptors.cpp index f5e0d3cb9a673..ba92bd14d319d 100644 --- a/compiler-rt/lib/msan/msan_interceptors.cpp +++ b/compiler-rt/lib/msan/msan_interceptors.cpp @@ -464,6 +464,25 @@ INTERCEPTORS_STRTO_BASE(long long, wcstoll, wchar_t) INTERCEPTORS_STRTO_BASE(unsigned long, wcstoul, wchar_t) INTERCEPTORS_STRTO_BASE(unsigned long long, wcstoull, wchar_t) +#if SANITIZER_GLIBC +INTERCEPTORS_STRTO(double, __isoc23_strtod, char) +INTERCEPTORS_STRTO(float, __isoc23_strtof, char) +INTERCEPTORS_STRTO(long double, __isoc23_strtold, char) +INTERCEPTORS_STRTO_BASE(long, __isoc23_strtol, char) +INTERCEPTORS_STRTO_BASE(long long, __isoc23_strtoll, char) +INTERCEPTORS_STRTO_BASE(unsigned long, __isoc23_strtoul, char) +INTERCEPTORS_STRTO_BASE(unsigned long long, __isoc23_strtoull, char) +INTERCEPTORS_STRTO_BASE(u64, __isoc23_strtouq, char) + +INTERCEPTORS_STRTO(double, __isoc23_wcstod, wchar_t) +INTERCEPTORS_STRTO(float, __isoc23_wcstof, wchar_t) +INTERCEPTORS_STRTO(long double, __isoc23_wcstold, wchar_t) +INTERCEPTORS_STRTO_BASE(long, __isoc23_wcstol, wchar_t) +INTERCEPTORS_STRTO_BASE(long long, __isoc23_wcstoll, wchar_t) +INTERCEPTORS_STRTO_BASE(unsigned long, __isoc23_wcstoul, wchar_t) +INTERCEPTORS_STRTO_BASE(unsigned long long, __isoc23_wcstoull, wchar_t) +#endif + #if SANITIZER_NETBSD #define INTERCEPT_STRTO(func) \ INTERCEPT_FUNCTION(func); \ @@ -1748,6 +1767,24 @@ void InitializeInterceptors() { INTERCEPT_STRTO(wcstoul); INTERCEPT_STRTO(wcstoll); INTERCEPT_STRTO(wcstoull); +#if SANITIZER_GLIBC + INTERCEPT_STRTO(__isoc23_strtod); + INTERCEPT_STRTO(__isoc23_strtof); + INTERCEPT_STRTO(__isoc23_strtold); + INTERCEPT_STRTO(__isoc23_strtol); + 
INTERCEPT_STRTO(__isoc23_strtoul); + INTERCEPT_STRTO(__isoc23_strtoll); + INTERCEPT_STRTO(__isoc23_strtoull); + INTERCEPT_STRTO(__isoc23_strtouq); + INTERCEPT_STRTO(__isoc23_wcstod); + INTERCEPT_STRTO(__isoc23_wcstof); + INTERCEPT_STRTO(__isoc23_wcstold); + INTERCEPT_STRTO(__isoc23_wcstol); + INTERCEPT_STRTO(__isoc23_wcstoul); + INTERCEPT_STRTO(__isoc23_wcstoll); + INTERCEPT_STRTO(__isoc23_wcstoull); +#endif + #ifdef SANITIZER_NLDBL_VERSION INTERCEPT_FUNCTION_VER(vswprintf, SANITIZER_NLDBL_VERSION); INTERCEPT_FUNCTION_VER(swprintf, SANITIZER_NLDBL_VERSION); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc index 299561b3ad3a1..0e563fa12022a 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc @@ -1491,6 +1491,16 @@ VSCANF_INTERCEPTOR_IMPL(__isoc99_vsscanf, false, str, format, ap) INTERCEPTOR(int, __isoc99_vfscanf, void *stream, const char *format, va_list ap) VSCANF_INTERCEPTOR_IMPL(__isoc99_vfscanf, false, stream, format, ap) + +INTERCEPTOR(int, __isoc23_vscanf, const char *format, va_list ap) +VSCANF_INTERCEPTOR_IMPL(__isoc23_vscanf, false, format, ap) + +INTERCEPTOR(int, __isoc23_vsscanf, const char *str, const char *format, + va_list ap) +VSCANF_INTERCEPTOR_IMPL(__isoc23_vsscanf, false, str, format, ap) + +INTERCEPTOR(int, __isoc23_vfscanf, void *stream, const char *format, va_list ap) +VSCANF_INTERCEPTOR_IMPL(__isoc23_vfscanf, false, stream, format, ap) #endif // SANITIZER_INTERCEPT_ISOC99_SCANF INTERCEPTOR(int, scanf, const char *format, ...) @@ -1511,6 +1521,15 @@ FORMAT_INTERCEPTOR_IMPL(__isoc99_fscanf, __isoc99_vfscanf, stream, format) INTERCEPTOR(int, __isoc99_sscanf, const char *str, const char *format, ...) FORMAT_INTERCEPTOR_IMPL(__isoc99_sscanf, __isoc99_vsscanf, str, format) + +INTERCEPTOR(int, __isoc23_scanf, const char *format, ...) 
+FORMAT_INTERCEPTOR_IMPL(__isoc23_scanf, __isoc23_vscanf, format) + +INTERCEPTOR(int, __isoc23_fscanf, void *stream, const char *format, ...) +FORMAT_INTERCEPTOR_IMPL(__isoc23_fscanf, __isoc23_vfscanf, stream, format) + +INTERCEPTOR(int, __isoc23_sscanf, const char *str, const char *format, ...) +FORMAT_INTERCEPTOR_IMPL(__isoc23_sscanf, __isoc23_vsscanf, str, format) #endif #endif @@ -1534,7 +1553,13 @@ FORMAT_INTERCEPTOR_IMPL(__isoc99_sscanf, __isoc99_vsscanf, str, format) COMMON_INTERCEPT_FUNCTION(__isoc99_fscanf); \ COMMON_INTERCEPT_FUNCTION(__isoc99_vscanf); \ COMMON_INTERCEPT_FUNCTION(__isoc99_vsscanf); \ - COMMON_INTERCEPT_FUNCTION(__isoc99_vfscanf); + COMMON_INTERCEPT_FUNCTION(__isoc99_vfscanf); \ + COMMON_INTERCEPT_FUNCTION(__isoc23_scanf); \ + COMMON_INTERCEPT_FUNCTION(__isoc23_sscanf); \ + COMMON_INTERCEPT_FUNCTION(__isoc23_fscanf); \ + COMMON_INTERCEPT_FUNCTION(__isoc23_vscanf); \ + COMMON_INTERCEPT_FUNCTION(__isoc23_vsscanf); \ + COMMON_INTERCEPT_FUNCTION(__isoc23_vfscanf); #else #define INIT_ISOC99_SCANF #endif @@ -3539,30 +3564,26 @@ UNUSED static inline void StrtolFixAndCheck(void *ctx, const char *nptr, (real_endptr - nptr) + 1 : 0); } - #if SANITIZER_INTERCEPT_STRTOIMAX -INTERCEPTOR(INTMAX_T, strtoimax, const char *nptr, char **endptr, int base) { - void *ctx; - COMMON_INTERCEPTOR_ENTER(ctx, strtoimax, nptr, endptr, base); - // FIXME: under ASan the call below may write to freed memory and corrupt - // its metadata. See - // https://github.com/google/sanitizers/issues/321. 
+template +static ALWAYS_INLINE auto StrtoimaxImpl(void *ctx, Fn real, const char *nptr, + char **endptr, int base) + -> decltype(real(nullptr, nullptr, 0)) { char *real_endptr; - INTMAX_T res = REAL(strtoimax)(nptr, &real_endptr, base); + auto res = real(nptr, &real_endptr, base); StrtolFixAndCheck(ctx, nptr, endptr, real_endptr, base); return res; } +INTERCEPTOR(INTMAX_T, strtoimax, const char *nptr, char **endptr, int base) { + void *ctx; + COMMON_INTERCEPTOR_ENTER(ctx, strtoimax, nptr, endptr, base); + return StrtoimaxImpl(ctx, REAL(strtoimax), nptr, endptr, base); +} INTERCEPTOR(UINTMAX_T, strtoumax, const char *nptr, char **endptr, int base) { void *ctx; COMMON_INTERCEPTOR_ENTER(ctx, strtoumax, nptr, endptr, base); - // FIXME: under ASan the call below may write to freed memory and corrupt - // its metadata. See - // https://github.com/google/sanitizers/issues/321. - char *real_endptr; - UINTMAX_T res = REAL(strtoumax)(nptr, &real_endptr, base); - StrtolFixAndCheck(ctx, nptr, endptr, real_endptr, base); - return res; + return StrtoimaxImpl(ctx, REAL(strtoumax), nptr, endptr, base); } #define INIT_STRTOIMAX \ @@ -3572,6 +3593,25 @@ INTERCEPTOR(UINTMAX_T, strtoumax, const char *nptr, char **endptr, int base) { #define INIT_STRTOIMAX #endif +#if SANITIZER_INTERCEPT_STRTOIMAX && SANITIZER_GLIBC +INTERCEPTOR(INTMAX_T, __isoc23_strtoimax, const char *nptr, char **endptr, int base) { + void *ctx; + COMMON_INTERCEPTOR_ENTER(ctx, __isoc23_strtoimax, nptr, endptr, base); + return StrtoimaxImpl(ctx, REAL(__isoc23_strtoimax), nptr, endptr, base); +} +INTERCEPTOR(UINTMAX_T, __isoc23_strtoumax, const char *nptr, char **endptr, int base) { + void *ctx; + COMMON_INTERCEPTOR_ENTER(ctx, __isoc23_strtoumax, nptr, endptr, base); + return StrtoimaxImpl(ctx, REAL(__isoc23_strtoumax), nptr, endptr, base); +} + +# define INIT_STRTOIMAX_C23 \ + COMMON_INTERCEPT_FUNCTION(__isoc23_strtoimax); \ + COMMON_INTERCEPT_FUNCTION(__isoc23_strtoumax); +#else +# define INIT_STRTOIMAX_C23 +#endif 
+ #if SANITIZER_INTERCEPT_MBSTOWCS INTERCEPTOR(SIZE_T, mbstowcs, wchar_t *dest, const char *src, SIZE_T len) { void *ctx; @@ -10304,6 +10344,7 @@ static void InitializeCommonInterceptors() { INIT_GETCWD; INIT_GET_CURRENT_DIR_NAME; INIT_STRTOIMAX; + INIT_STRTOIMAX_C23; INIT_MBSTOWCS; INIT_MBSNRTOWCS; INIT_WCSTOMBS; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_format.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_format.inc index 220abb89c3beb..24485900644b3 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_format.inc +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_format.inc @@ -340,11 +340,19 @@ static void scanf_common(void *ctx, int n_inputs, bool allowGnuMalloc, size = 0; } COMMON_INTERCEPTOR_WRITE_RANGE(ctx, argp, size); - // For %ms/%mc, write the allocated output buffer as well. + // For %mc/%mC/%ms/%m[/%mS, write the allocated output buffer as well. if (dir.allocate) { - char *buf = *(char **)argp; - if (buf) - COMMON_INTERCEPTOR_WRITE_RANGE(ctx, buf, internal_strlen(buf) + 1); + if (char *buf = *(char **)argp) { + if (dir.convSpecifier == 'c') + size = 1; + else if (dir.convSpecifier == 'C') + size = sizeof(wchar_t); + else if (dir.convSpecifier == 'S') + size = (internal_wcslen((wchar_t *)buf) + 1) * sizeof(wchar_t); + else // 's' or '[' + size = internal_strlen(buf) + 1; + COMMON_INTERCEPTOR_WRITE_RANGE(ctx, buf, size); + } } } } diff --git a/compiler-rt/lib/sanitizer_common/symbolizer/scripts/global_symbols.txt b/compiler-rt/lib/sanitizer_common/symbolizer/scripts/global_symbols.txt index 509e3f19fe386..819cfca44b00b 100644 --- a/compiler-rt/lib/sanitizer_common/symbolizer/scripts/global_symbols.txt +++ b/compiler-rt/lib/sanitizer_common/symbolizer/scripts/global_symbols.txt @@ -34,6 +34,13 @@ __interceptor_pthread_setspecific w __interceptor_read w __interceptor_realpath w __isinf U +__isoc23_sscanf U +__isoc23_strtol U +__isoc23_strtoll U 
+__isoc23_strtoll_l U +__isoc23_strtoull U +__isoc23_strtoull_l U +__isoc23_vsscanf U __isoc99_sscanf U __isoc99_vsscanf U __moddi3 U diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_format_interceptor_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_format_interceptor_test.cpp index fa52ccc1994f6..de96e573ab844 100644 --- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_format_interceptor_test.cpp +++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_format_interceptor_test.cpp @@ -9,14 +9,16 @@ // Tests for *scanf interceptors implementation in sanitizer_common. // //===----------------------------------------------------------------------===// +#include + #include #include +#include "gtest/gtest.h" #include "interception/interception.h" -#include "sanitizer_test_utils.h" -#include "sanitizer_common/sanitizer_libc.h" #include "sanitizer_common/sanitizer_common.h" -#include "gtest/gtest.h" +#include "sanitizer_common/sanitizer_libc.h" +#include "sanitizer_test_utils.h" using namespace __sanitizer; @@ -206,21 +208,35 @@ TEST(SanitizerCommonInterceptors, Scanf) { TEST(SanitizerCommonInterceptors, ScanfAllocate) { const char *buf = "123456"; + const wchar_t *wbuf = L"123"; // Can not use testScanf() because this case needs a valid pointer to a string // in the scanf argument. 
+ { + std::vector scanf_sizes; + testScanf3((void *)&scanf_sizes, 2, /*allowGnuMalloc=*/false, "%mc", &buf); + verifyFormatResults("%mc", 2, scanf_sizes, {P, 1u}); + } + { + std::vector scanf_sizes; + testScanf3((void *)&scanf_sizes, 2, /*allowGnuMalloc=*/false, "%mC", &wbuf); + verifyFormatResults("%mC", 2, scanf_sizes, {P, (unsigned)sizeof(wchar_t)}); + } { std::vector scanf_sizes; testScanf3((void *)&scanf_sizes, 2, /*allowGnuMalloc=*/false, "%ms", &buf); - verifyFormatResults("%ms", 2, scanf_sizes, - {P, (unsigned)(strlen(buf) + 1)}); + verifyFormatResults("%ms", 2, scanf_sizes, {P, unsigned(strlen(buf) + 1)}); + scanf_sizes.clear(); + testScanf3((void *)&scanf_sizes, 2, /*allowGnuMalloc=*/false, "%m[0-9]", + &buf); + verifyFormatResults("%m[0-9]", 2, scanf_sizes, + {P, unsigned(strlen(buf) + 1)}); } - { std::vector scanf_sizes; - testScanf3((void *)&scanf_sizes, 2, /*allowGnuMalloc=*/false, "%mc", &buf); - verifyFormatResults("%mc", 2, scanf_sizes, - {P, (unsigned)(strlen(buf) + 1)}); + testScanf3((void *)&scanf_sizes, 2, /*allowGnuMalloc=*/false, "%mS", &wbuf); + verifyFormatResults("%mS", 2, scanf_sizes, + {P, unsigned((wcslen(wbuf) + 1) * sizeof(wchar_t))}); } } diff --git a/compiler-rt/test/asan/TestCases/atoll_strict.c b/compiler-rt/test/asan/TestCases/atoll_strict.c index 431ec6b4ba230..b204c97b17580 100644 --- a/compiler-rt/test/asan/TestCases/atoll_strict.c +++ b/compiler-rt/test/asan/TestCases/atoll_strict.c @@ -10,9 +10,6 @@ // RUN: %env_asan_opts=strict_string_checks=false %run %t test3 2>&1 // RUN: %env_asan_opts=strict_string_checks=true not %run %t test3 2>&1 | FileCheck %s --check-prefix=CHECK3 -// FIXME: Needs Windows interceptor. 
-// XFAIL: target={{.*windows-(msvc.*|gnu)}} - #include #include #include diff --git a/compiler-rt/test/asan/TestCases/strtoll_strict.c b/compiler-rt/test/asan/TestCases/strtoll_strict.c index 097412e3ab5c2..88e6651b6ed11 100644 --- a/compiler-rt/test/asan/TestCases/strtoll_strict.c +++ b/compiler-rt/test/asan/TestCases/strtoll_strict.c @@ -24,7 +24,7 @@ // FIXME: Enable strtoll interceptor. // REQUIRES: shadow-scale-3 -// XFAIL: target={{.*windows-(msvc.*|gnu)}} +// XFAIL: target={{.*windows-msvc.*}} #include #include diff --git a/compiler-rt/test/sanitizer_common/TestCases/scanf.c b/compiler-rt/test/sanitizer_common/TestCases/scanf.c new file mode 100644 index 0000000000000..a42d9f72a71d9 --- /dev/null +++ b/compiler-rt/test/sanitizer_common/TestCases/scanf.c @@ -0,0 +1,24 @@ +// RUN: %clang -std=c17 %s -o %t && %run %t +/// Test __isoc23_* for glibc 2.38+. +// RUN: %clang -std=c2x %s -o %t && %run %t + +#include +#include +#include + +int test_vsscanf(const char *buf, const char *fmt, ...) { + va_list ap; + va_start(ap, fmt); + int ret = vsscanf(buf, fmt, ap); + va_end(ap); + return ret; +} + +int main(int argc, char **argv) { + int x, y; + assert(sscanf("42", "%d", &x) == 1); + assert(x == 42); + assert(test_vsscanf("42", "%d", &y) == 1); + assert(y == 42); + return 0; +} diff --git a/compiler-rt/test/sanitizer_common/TestCases/strtol.c b/compiler-rt/test/sanitizer_common/TestCases/strtol.c new file mode 100644 index 0000000000000..c3de9bcb7aa04 --- /dev/null +++ b/compiler-rt/test/sanitizer_common/TestCases/strtol.c @@ -0,0 +1,61 @@ +// RUN: %clang -std=c17 %s -o %t && %run %t +/// Test __isoc23_* for glibc 2.38+. 
+// RUN: %clang -std=c2x %s -o %t && %run %t + +#include +#include +#include +#include +#include + +#define TESTL(func) \ + { \ + char *end; \ + long l = (long)func("42", &end, 0); \ + assert(l == 42); \ + assert(*end == '\0'); \ + } + +#define TESTF(func) \ + { \ + char *end; \ + long l = (long)func("42", &end); \ + assert(l == 42); \ + assert(*end == '\0'); \ + } + +#define WTESTL(func) \ + { \ + wchar_t *end; \ + long l = (long)func(L"42", &end, 0); \ + assert(l == 42); \ + assert(*end == L'\0'); \ + } + +#define WTESTF(func) \ + { \ + wchar_t *end; \ + long l = (long)func(L"42", &end); \ + assert(l == 42); \ + assert(*end == '\0'); \ + } + +int main() { + TESTL(strtol); + TESTL(strtoll); + TESTL(strtoimax); + TESTL(strtoul); + TESTL(strtoull); + TESTL(strtoumax); + TESTF(strtof); + TESTF(strtod); + TESTF(strtold); + + WTESTL(wcstol); + WTESTL(wcstoll); + WTESTL(wcstoul); + WTESTL(wcstoull); + WTESTF(wcstof); + WTESTF(wcstod); + WTESTF(wcstold); +} diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp index 61ba0f584ae6d..960fc62190f5f 100644 --- a/flang/lib/Optimizer/Dialect/FIROps.cpp +++ b/flang/lib/Optimizer/Dialect/FIROps.cpp @@ -128,9 +128,8 @@ static mlir::ParseResult parseAllocatableOp(FN wrapResultType, parser.emitError(parser.getNameLoc(), "invalid allocate type: ") << intype; return mlir::failure(); } - result.addAttribute( - "operand_segment_sizes", - builder.getDenseI32ArrayAttr({typeparamsSize, shapeSize})); + result.addAttribute("operandSegmentSizes", builder.getDenseI32ArrayAttr( + {typeparamsSize, shapeSize})); if (parser.parseOptionalAttrDict(result.attributes) || parser.addTypeToList(restype, result.types)) return mlir::failure(); @@ -149,7 +148,7 @@ static void printAllocatableOp(mlir::OpAsmPrinter &p, OP &op) { p << ", "; p.printOperand(sh); } - p.printOptionalAttrDict(op->getAttrs(), {"in_type", "operand_segment_sizes"}); + p.printOptionalAttrDict(op->getAttrs(), {"in_type", "operandSegmentSizes"}); } 
//===----------------------------------------------------------------------===// diff --git a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir index bf03c24fee75c..d67198d97699e 100644 --- a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir +++ b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir @@ -28,7 +28,7 @@ func.func @_QPsb1(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.ref, pinned} : (i64) -> !llvm.ptr +// CHECK: %[[I_VAR:.*]] = llvm.alloca %[[ONE_3]] x i32 {adapt.valuebyref, in_type = i32, operandSegmentSizes = array, pinned} : (i64) -> !llvm.ptr // CHECK: %[[N:.*]] = llvm.load %[[N_REF]] : !llvm.ptr // CHECK: omp.wsloop nowait // CHECK-SAME: for (%[[I:.*]]) : i32 = (%[[ONE_2]]) to (%[[N]]) inclusive step (%[[ONE_2]]) { @@ -200,7 +200,7 @@ func.func @_QPsimd1(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.ref // CHECK: %[[ONE_2:.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK: omp.parallel { // CHECK: %[[ONE_3:.*]] = llvm.mlir.constant(1 : i64) : i64 -// CHECK: %[[I_VAR:.*]] = llvm.alloca %[[ONE_3]] x i32 {adapt.valuebyref, in_type = i32, operand_segment_sizes = array, pinned} : (i64) -> !llvm.ptr +// CHECK: %[[I_VAR:.*]] = llvm.alloca %[[ONE_3]] x i32 {adapt.valuebyref, in_type = i32, operandSegmentSizes = array, pinned} : (i64) -> !llvm.ptr // CHECK: %[[N:.*]] = llvm.load %[[N_REF]] : !llvm.ptr // CHECK: omp.simdloop // CHECK-SAME: (%[[I:.*]]) : i32 = (%[[ONE_2]]) to (%[[N]]) step (%[[ONE_2]]) { @@ -231,13 +231,13 @@ func.func @_QPomp_target_data() { // CHECK-LABEL: llvm.func @_QPomp_target_data() { // CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(1 : i64) : i64 -// CHECK: %[[VAL_1:.*]] = llvm.alloca %[[VAL_0]] x !llvm.array<1024 x i32> {bindc_name = "a", in_type = !fir.array<1024xi32>, operand_segment_sizes = array, uniq_name = "_QFomp_target_dataEa"} : (i64) -> !llvm.ptr> +// CHECK: %[[VAL_1:.*]] = llvm.alloca %[[VAL_0]] x !llvm.array<1024 x i32> {bindc_name = "a", in_type = 
!fir.array<1024xi32>, operandSegmentSizes = array, uniq_name = "_QFomp_target_dataEa"} : (i64) -> !llvm.ptr> // CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(1 : i64) : i64 -// CHECK: %[[VAL_3:.*]] = llvm.alloca %[[VAL_2]] x !llvm.array<1024 x i32> {bindc_name = "b", in_type = !fir.array<1024xi32>, operand_segment_sizes = array, uniq_name = "_QFomp_target_dataEb"} : (i64) -> !llvm.ptr> +// CHECK: %[[VAL_3:.*]] = llvm.alloca %[[VAL_2]] x !llvm.array<1024 x i32> {bindc_name = "b", in_type = !fir.array<1024xi32>, operandSegmentSizes = array, uniq_name = "_QFomp_target_dataEb"} : (i64) -> !llvm.ptr> // CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(1 : i64) : i64 -// CHECK: %[[VAL_5:.*]] = llvm.alloca %[[VAL_4]] x !llvm.array<1024 x i32> {bindc_name = "c", in_type = !fir.array<1024xi32>, operand_segment_sizes = array, uniq_name = "_QFomp_target_dataEc"} : (i64) -> !llvm.ptr> +// CHECK: %[[VAL_5:.*]] = llvm.alloca %[[VAL_4]] x !llvm.array<1024 x i32> {bindc_name = "c", in_type = !fir.array<1024xi32>, operandSegmentSizes = array, uniq_name = "_QFomp_target_dataEc"} : (i64) -> !llvm.ptr> // CHECK: %[[VAL_6:.*]] = llvm.mlir.constant(1 : i64) : i64 -// CHECK: %[[VAL_7:.*]] = llvm.alloca %[[VAL_6]] x !llvm.array<1024 x i32> {bindc_name = "d", in_type = !fir.array<1024xi32>, operand_segment_sizes = array, uniq_name = "_QFomp_target_dataEd"} : (i64) -> !llvm.ptr> +// CHECK: %[[VAL_7:.*]] = llvm.alloca %[[VAL_6]] x !llvm.array<1024 x i32> {bindc_name = "d", in_type = !fir.array<1024xi32>, operandSegmentSizes = array, uniq_name = "_QFomp_target_dataEd"} : (i64) -> !llvm.ptr> // CHECK: omp.target_enter_data map((to -> %[[VAL_1]] : !llvm.ptr>), (to -> %[[VAL_3]] : !llvm.ptr>), (always, alloc -> %[[VAL_5]] : !llvm.ptr>)) // CHECK: omp.target_exit_data map((from -> %[[VAL_1]] : !llvm.ptr>), (from -> %[[VAL_3]] : !llvm.ptr>), (release -> %[[VAL_5]] : !llvm.ptr>), (always, delete -> %[[VAL_7]] : !llvm.ptr>)) // CHECK: llvm.return @@ -278,9 +278,9 @@ func.func @_QPopenmp_target_data_region() { 
// CHECK-LABEL: llvm.func @_QPopenmp_target_data_region() { // CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(1 : i64) : i64 -// CHECK: %[[VAL_1:.*]] = llvm.alloca %[[VAL_0]] x !llvm.array<1024 x i32> {bindc_name = "a", in_type = !fir.array<1024xi32>, operand_segment_sizes = array, uniq_name = "_QFopenmp_target_data_regionEa"} : (i64) -> !llvm.ptr> +// CHECK: %[[VAL_1:.*]] = llvm.alloca %[[VAL_0]] x !llvm.array<1024 x i32> {bindc_name = "a", in_type = !fir.array<1024xi32>, operandSegmentSizes = array, uniq_name = "_QFopenmp_target_data_regionEa"} : (i64) -> !llvm.ptr> // CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(1 : i64) : i64 -// CHECK: %[[VAL_3:.*]] = llvm.alloca %[[VAL_2]] x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array, uniq_name = "_QFopenmp_target_data_regionEi"} : (i64) -> !llvm.ptr +// CHECK: %[[VAL_3:.*]] = llvm.alloca %[[VAL_2]] x i32 {bindc_name = "i", in_type = i32, operandSegmentSizes = array, uniq_name = "_QFopenmp_target_data_regionEi"} : (i64) -> !llvm.ptr // CHECK: omp.target_data map((tofrom -> %[[VAL_1]] : !llvm.ptr>)) { // CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK: %[[VAL_5:.*]] = llvm.sext %[[VAL_4]] : i32 to i64 @@ -338,7 +338,7 @@ func.func @_QPomp_target() { // CHECK-LABEL: llvm.func @_QPomp_target() { // CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(1 : i64) : i64 -// CHECK: %[[VAL_1:.*]] = llvm.alloca %[[VAL_0]] x !llvm.array<512 x i32> {bindc_name = "a", in_type = !fir.array<512xi32>, operand_segment_sizes = array, uniq_name = "_QFomp_targetEa"} : (i64) -> !llvm.ptr> +// CHECK: %[[VAL_1:.*]] = llvm.alloca %[[VAL_0]] x !llvm.array<512 x i32> {bindc_name = "a", in_type = !fir.array<512xi32>, operandSegmentSizes = array, uniq_name = "_QFomp_targetEa"} : (i64) -> !llvm.ptr> // CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(64 : i32) : i32 // CHECK: omp.target thread_limit(%[[VAL_2]] : i32) map((tofrom -> %[[VAL_1]] : !llvm.ptr>)) { // CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(10 : i32) : i32 @@ -544,7 +544,7 @@ 
func.func @_QPsb() { // CHECK: llvm.func @_QPsb() { // CHECK: %[[ONE:.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK: %[[SIZE:.*]] = llvm.mlir.constant(1 : i64) : i64 -// CHECK: %[[LI_REF:.*]] = llvm.alloca %6 x i32 {bindc_name = "li", in_type = i32, operand_segment_sizes = array, uniq_name = "_QFsbEli"} : (i64) -> !llvm.ptr +// CHECK: %[[LI_REF:.*]] = llvm.alloca %6 x i32 {bindc_name = "li", in_type = i32, operandSegmentSizes = array, uniq_name = "_QFsbEli"} : (i64) -> !llvm.ptr // CHECK: omp.sections { // CHECK: omp.section { // CHECK: llvm.br ^[[BB_ENTRY:.*]]({{.*}}) @@ -582,7 +582,7 @@ func.func @_QPsb() { // CHECK: } // CHECK-LABEL: @_QPsimple_reduction // CHECK-SAME: %[[ARRAY_REF:.*]]: !llvm.ptr> -// CHECK: %[[RED_ACCUMULATOR:.*]] = llvm.alloca %2 x i32 {bindc_name = "x", in_type = !fir.logical<4>, operand_segment_sizes = array, uniq_name = "_QFsimple_reductionEx"} : (i64) -> !llvm.ptr +// CHECK: %[[RED_ACCUMULATOR:.*]] = llvm.alloca %2 x i32 {bindc_name = "x", in_type = !fir.logical<4>, operandSegmentSizes = array, uniq_name = "_QFsimple_reductionEx"} : (i64) -> !llvm.ptr // CHECK: omp.parallel { // CHECK: omp.wsloop reduction(@[[EQV_REDUCTION]] -> %[[RED_ACCUMULATOR]] : !llvm.ptr) for // CHECK: %[[ARRAY_ELEM_REF:.*]] = llvm.getelementptr %[[ARRAY_REF]][0, %{{.*}}] : (!llvm.ptr>, i64) -> !llvm.ptr diff --git a/flang/test/Fir/convert-to-llvm.fir b/flang/test/Fir/convert-to-llvm.fir index d0c154fb0376e..52716afe3198d 100644 --- a/flang/test/Fir/convert-to-llvm.fir +++ b/flang/test/Fir/convert-to-llvm.fir @@ -1748,7 +1748,7 @@ func.func @no_reassoc(%arg0: !fir.ref) { // CHECK-LABEL: llvm.func @no_reassoc( // CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr) { // CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : i64) : i64 -// CHECK: %[[ALLOC:.*]] = llvm.alloca %[[C1]] x i32 {in_type = i32, operand_segment_sizes = array} : (i64) -> !llvm.ptr +// CHECK: %[[ALLOC:.*]] = llvm.alloca %[[C1]] x i32 {in_type = i32, operandSegmentSizes = array} : (i64) -> !llvm.ptr // CHECK: 
%[[LOAD:.*]] = llvm.load %[[ARG0]] : !llvm.ptr // CHECK: llvm.store %[[LOAD]], %[[ALLOC]] : !llvm.ptr // CHECK: llvm.return @@ -1868,7 +1868,7 @@ func.func private @_QPxb(!fir.box>) // CHECK: %[[C1_0:.*]] = llvm.mlir.constant(1 : i64) : i64 // CHECK: %[[ARR_SIZE_TMP1:.*]] = llvm.mul %[[C1_0]], %[[N1]] : i64 // CHECK: %[[ARR_SIZE:.*]] = llvm.mul %[[ARR_SIZE_TMP1]], %[[N2]] : i64 -// CHECK: %[[ARR:.*]] = llvm.alloca %[[ARR_SIZE]] x f64 {bindc_name = "arr", in_type = !fir.array, operand_segment_sizes = array, uniq_name = "_QFsbEarr"} : (i64) -> !llvm.ptr +// CHECK: %[[ARR:.*]] = llvm.alloca %[[ARR_SIZE]] x f64 {bindc_name = "arr", in_type = !fir.array, operandSegmentSizes = array, uniq_name = "_QFsbEarr"} : (i64) -> !llvm.ptr // CHECK: %[[TYPE_CODE:.*]] = llvm.mlir.constant(28 : i32) : i32 // CHECK: %[[NULL:.*]] = llvm.mlir.null : !llvm.ptr // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[NULL]][1] @@ -1945,9 +1945,9 @@ func.func private @_QPtest_dt_callee(%arg0: !fir.box>) // CHECK: %[[C10:.*]] = llvm.mlir.constant(10 : i64) : i64 // CHECK: %[[C2:.*]] = llvm.mlir.constant(2 : i64) : i64 // CHECK: %[[ALLOCA_SIZE_V:.*]] = llvm.mlir.constant(1 : i64) : i64 -// CHECK: %[[V:.*]] = llvm.alloca %[[ALLOCA_SIZE_V]] x i32 {bindc_name = "v", in_type = i32, operand_segment_sizes = array, uniq_name = "_QFtest_dt_sliceEv"} : (i64) -> !llvm.ptr +// CHECK: %[[V:.*]] = llvm.alloca %[[ALLOCA_SIZE_V]] x i32 {bindc_name = "v", in_type = i32, operandSegmentSizes = array, uniq_name = "_QFtest_dt_sliceEv"} : (i64) -> !llvm.ptr // CHECK: %[[ALLOCA_SIZE_X:.*]] = llvm.mlir.constant(1 : i64) : i64 -// CHECK: %[[X:.*]] = llvm.alloca %[[ALLOCA_SIZE_X]] x !llvm.array<20 x struct<"_QFtest_dt_sliceTt", (i32, i32)>> {bindc_name = "x", in_type = !fir.array<20x!fir.type<_QFtest_dt_sliceTt{i:i32,j:i32}>>, operand_segment_sizes = array, uniq_name = "_QFtest_dt_sliceEx"} : (i64) -> !llvm.ptr>> +// CHECK: %[[X:.*]] = llvm.alloca %[[ALLOCA_SIZE_X]] x !llvm.array<20 x struct<"_QFtest_dt_sliceTt", (i32, i32)>> 
{bindc_name = "x", in_type = !fir.array<20x!fir.type<_QFtest_dt_sliceTt{i:i32,j:i32}>>, operandSegmentSizes = array, uniq_name = "_QFtest_dt_sliceEx"} : (i64) -> !llvm.ptr>> // CHECK: %[[TYPE_CODE:.*]] = llvm.mlir.constant(9 : i32) : i32 // CHECK: %[[NULL:.*]] = llvm.mlir.null : !llvm.ptr // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[NULL]][1] diff --git a/libcxx/docs/Contributing.rst b/libcxx/docs/Contributing.rst index cfae4d67ffdf4..3e3032ece99e4 100644 --- a/libcxx/docs/Contributing.rst +++ b/libcxx/docs/Contributing.rst @@ -49,7 +49,7 @@ sure you don't forget anything: - Did you add it to ``include/module.modulemap.in``? - Did you add it to ``include/CMakeLists.txt``? - - If it's a public header, did you update ``utils/libcxx/test/header_information.py``? + - If it's a public header, did you update ``utils/libcxx/header_information.py``? - Did you add the relevant feature test macro(s) for your feature? Did you update the ``generate_feature_test_macro_components.py`` script with it? - Did you run the ``libcxx-generate-files`` target and verify its output? 
diff --git a/libcxx/include/__format/format_functions.h b/libcxx/include/__format/format_functions.h index 27ec0a295f4f4..bb62c1ce10c15 100644 --- a/libcxx/include/__format/format_functions.h +++ b/libcxx/include/__format/format_functions.h @@ -245,6 +245,9 @@ __handle_replacement_field(_Iterator __begin, _Iterator __end, using _CharT = iter_value_t<_Iterator>; __format::__parse_number_result __r = __format::__parse_arg_id(__begin, __end, __parse_ctx); + if (__r.__last == __end) + std::__throw_format_error("The argument index should end with a ':' or a '}'"); + bool __parse = *__r.__last == _CharT(':'); switch (*__r.__last) { case _CharT(':'): diff --git a/libcxx/include/__locale_dir/locale_base_api/locale_guard.h b/libcxx/include/__locale_dir/locale_base_api/locale_guard.h index 0e2e91af7d190..5946ed698e0fd 100644 --- a/libcxx/include/__locale_dir/locale_base_api/locale_guard.h +++ b/libcxx/include/__locale_dir/locale_base_api/locale_guard.h @@ -10,6 +10,7 @@ #define _LIBCPP___LOCALE_LOCALE_BASE_API_LOCALE_GUARD_H #include <__config> +#include <__locale> // for locale_t #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__std_clang_module b/libcxx/include/__std_clang_module index 61a926eb6307e..4d02336d30b06 100644 --- a/libcxx/include/__std_clang_module +++ b/libcxx/include/__std_clang_module @@ -7,6 +7,10 @@ // //===----------------------------------------------------------------------===// +// WARNING, this entire header is generated by +// utils/generate_std_clang_module_header.py +// DO NOT MODIFY! + // This header should not be directly included, it's exclusively to import all // of the libc++ public clang modules for the `std` clang module to export. 
In // other words, it's to facilitate `@import std;` in Objective-C++ and `import std` @@ -17,7 +21,6 @@ # error "Do not include this header directly, include individual headers instead" #endif -#include <__availability> #include <__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -27,52 +30,187 @@ #include #include #include -#include +#if !defined(_LIBCPP_HAS_NO_ATOMIC_HEADER) +# include +#endif +#if !defined(_LIBCPP_HAS_NO_THREADS) +# include +#endif #include #include +#include +#include +#include +#include +#include +#include #include #include +#include +#include +#include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif #include +#include #include #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if !defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS) +# include +#endif +#if !defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS) +# include +#endif #include +#include #include #include #include +#include +#include +#include +#include +#include +#include +#include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif #include +#if !defined(_LIBCPP_HAS_NO_THREADS) +# include +#endif #include +#include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif #include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif #include +#if !defined(_LIBCPP_HAS_NO_THREADS) +# include +#endif +#include #include #include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif #include +#include 
#include #include #include +#include #include #include #include #include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#include #include #include #include #include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif #include +#if !defined(_LIBCPP_HAS_NO_THREADS) +# include +#endif #include +#include +#if !defined(_LIBCPP_HAS_NO_THREADS) +# include +#endif #include #include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif #include +#if !defined(_LIBCPP_HAS_NO_ATOMIC_HEADER) +# include +#endif +#include +#include #include +#include +#include +#include +#if !defined(_LIBCPP_HAS_NO_THREADS) +# include +#endif +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif +#include #include #include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) +# include +#endif #include +#include +#if !defined(_LIBCPP_HAS_NO_THREADS) +# include +#endif #include #include #include #include +#include #include #include #include @@ -80,133 +218,9 @@ #include #include #include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifndef _LIBCPP_HAS_NO_LOCALIZATION -# include -# ifndef _LIBCPP_HAS_NO_FILESYSTEM -# include -# endif -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include - -# include - -# include -#endif - -#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS -# include -# include - +#if !defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS) # include -# include -#endif - -#ifdef _LIBCPP_AVAILABILITY_TO_CHARS_FLOATING_POINT -# include -#endif - -#ifndef _LIBCPP_CXX03_LANG -# ifndef _LIBCPP_HAS_NO_THREADS -# include -# include -# include -# endif - -# include -# include -# 
include -# include -# include -# include -# include -# ifndef _LIBCPP_HAS_NO_LOCALIZATION -# include -# endif -# include -# include -# include -# include -# include -# include -# include -# include #endif - -#if _LIBCPP_STD_VER >= 14 -# ifndef _LIBCPP_HAS_NO_THREADS -# include -# endif -#endif - -#if _LIBCPP_STD_VER >= 17 -# ifndef _LIBCPP_HAS_NO_FILESYSTEM -# include -# endif -#endif - -#if _LIBCPP_STD_VER >= 20 -# include - -# ifndef _LIBCPP_HAS_NO_THREADS -# include -# include -# include -# include -# endif -#endif - -#if _LIBCPP_STD_VER >= 23 -# ifndef _LIBCPP_HAS_NO_THREADS -# include -# endif +#if !defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS) +# include #endif diff --git a/libcxx/include/__type_traits/is_nothrow_constructible.h b/libcxx/include/__type_traits/is_nothrow_constructible.h index d4686d89fd96e..4949062433b78 100644 --- a/libcxx/include/__type_traits/is_nothrow_constructible.h +++ b/libcxx/include/__type_traits/is_nothrow_constructible.h @@ -22,7 +22,8 @@ _LIBCPP_BEGIN_NAMESPACE_STD -#if __has_builtin(__is_nothrow_constructible) +// GCC is disabled due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106611 +#if __has_builtin(__is_nothrow_constructible) && !defined(_LIBCPP_COMPILER_GCC) template < class _Tp, class... 
_Args> struct _LIBCPP_TEMPLATE_VIS is_nothrow_constructible diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in index bbb7090fd4bea..37a9edcd7ece1 100644 --- a/libcxx/include/module.modulemap.in +++ b/libcxx/include/module.modulemap.in @@ -16,7 +16,6 @@ module std_atomic [system] { export * } module std_barrier [system] { - @requires_LIBCXX_ENABLE_THREADS@ header "barrier" export * } @@ -37,7 +36,6 @@ module std_chrono [system] { export * } module std_codecvt [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "codecvt" export * } @@ -78,7 +76,6 @@ module std_expected [system] { export * } module std_filesystem [system] { - @requires_LIBCXX_ENABLE_FILESYSTEM@ header "filesystem" export * } @@ -91,8 +88,6 @@ module std_forward_list [system] { export * } module std_fstream [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ - @requires_LIBCXX_ENABLE_FILESYSTEM@ header "fstream" export * } @@ -101,7 +96,6 @@ module std_functional [system] { export * } module std_future [system] { - @requires_LIBCXX_ENABLE_THREADS@ header "future" export * } @@ -110,12 +104,10 @@ module std_initializer_list [system] { export * } module std_iomanip [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "iomanip" export * } module std_ios [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "ios" export * } @@ -124,12 +116,10 @@ module std_iosfwd [system] { export * } module std_iostream [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "iostream" export * } module std_istream [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "istream" export * } @@ -138,7 +128,6 @@ module std_iterator [system] { export * } module std_latch [system] { - @requires_LIBCXX_ENABLE_THREADS@ header "latch" export * } @@ -151,7 +140,6 @@ module std_list [system] { export * } module std_locale [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "locale" export * } @@ -192,7 +180,6 @@ module std_optional [system] { export * } module std_ostream 
[system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "ostream" export * } @@ -217,7 +204,6 @@ module std_ratio [system] { export * } module std_regex [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "regex" export * } @@ -226,7 +212,6 @@ module std_scoped_allocator [system] { export * } module std_semaphore [system] { - @requires_LIBCXX_ENABLE_THREADS@ header "semaphore" export * } @@ -235,7 +220,6 @@ module std_set [system] { export * } module std_shared_mutex [system] { - @requires_LIBCXX_ENABLE_THREADS@ header "shared_mutex" export std_version } @@ -250,7 +234,6 @@ module std_span [system] { export std_private_span_span_fwd } module std_sstream [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "sstream" export * } @@ -263,12 +246,10 @@ module std_stdexcept [system] { export * } module std_stop_token { - @requires_LIBCXX_ENABLE_THREADS@ header "stop_token" export * } module std_streambuf [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "streambuf" export * } @@ -281,7 +262,6 @@ module std_string_view [system] { export * } module std_strstream [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "strstream" export * } @@ -290,7 +270,6 @@ module std_system_error [system] { export * } module std_thread [system] { - @requires_LIBCXX_ENABLE_THREADS@ header "thread" export * } @@ -377,7 +356,6 @@ module std_climits [system] { export * } module std_clocale [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "clocale" export * } @@ -435,12 +413,10 @@ module std_cuchar [system] { export * } module std_cwchar [system] { - @requires_LIBCXX_ENABLE_WIDE_CHARACTERS@ header "cwchar" export * } module std_cwctype [system] { - @requires_LIBCXX_ENABLE_WIDE_CHARACTERS@ header "cwctype" export * } @@ -477,7 +453,6 @@ module std_limits_h [system] { export * } module std_locale_h [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "locale.h" export * } @@ -493,8 +468,6 @@ module std_setjmp_h [system] { // FIXME: is missing. 
// provided by compiler. module std_stdatomic_h [system] { - @requires_LIBCXX_ENABLE_THREADS@ - requires cplusplus23 header "stdatomic.h" export * } @@ -536,21 +509,17 @@ module std_uchar_h [system] { } // provided by C library. module std_wchar_h [system] { - @requires_LIBCXX_ENABLE_WIDE_CHARACTERS@ // 's __need_* macros require textual inclusion. textual header "wchar.h" export * } module std_wctype_h [system] { - @requires_LIBCXX_ENABLE_WIDE_CHARACTERS@ header "wctype.h" export * } // Experimental C++ standard library interfaces module std_experimental [system] { - requires cplusplus11 - module deque { header "experimental/deque" export * @@ -657,7 +626,6 @@ module std_private_hash_table [system] { export * } module std_private_locale [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "__locale" export * } @@ -1196,7 +1164,6 @@ module std_private_chrono_duration [system] { } module std_private_chrono_file_clock [system] { header "__chrono/file_clock.h" } module std_private_chrono_formatter [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "__chrono/formatter.h" } module std_private_chrono_hh_mm_ss [system] { header "__chrono/hh_mm_ss.h" } @@ -1210,11 +1177,9 @@ module std_private_chrono_month [system] { header "__chrono/mon module std_private_chrono_month_weekday [system] { header "__chrono/month_weekday.h" } module std_private_chrono_monthday [system] { header "__chrono/monthday.h" } module std_private_chrono_ostream [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "__chrono/ostream.h" } module std_private_chrono_parser_std_format_spec [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "__chrono/parser_std_format_spec.h" } module std_private_chrono_statically_widen [system] { header "__chrono/statically_widen.h" } @@ -1699,7 +1664,6 @@ module std_private_ranges_filter_view [system] { module std_private_ranges_from_range [system] { header "__ranges/from_range.h" } module std_private_ranges_iota_view [system] { header 
"__ranges/iota_view.h" } module std_private_ranges_istream_view [system] { - @requires_LIBCXX_ENABLE_LOCALIZATION@ header "__ranges/istream_view.h" } module std_private_ranges_join_view [system] { diff --git a/libcxx/src/chrono.cpp b/libcxx/src/chrono.cpp index 0990d8dc181c2..f1596132024c9 100644 --- a/libcxx/src/chrono.cpp +++ b/libcxx/src/chrono.cpp @@ -31,7 +31,7 @@ # include // for gettimeofday and timeval #endif -#if defined(__APPLE__) || (defined(_POSIX_TIMERS) && _POSIX_TIMERS > 0) +#if defined(__APPLE__) || defined (__gnu_hurd__) || (defined(_POSIX_TIMERS) && _POSIX_TIMERS > 0) # define _LIBCPP_HAS_CLOCK_GETTIME #endif diff --git a/libcxx/src/filesystem/filesystem_clock.cpp b/libcxx/src/filesystem/filesystem_clock.cpp index d00cdc6df3437..fbb19ac68df55 100644 --- a/libcxx/src/filesystem/filesystem_clock.cpp +++ b/libcxx/src/filesystem/filesystem_clock.cpp @@ -29,7 +29,7 @@ # include // for gettimeofday and timeval #endif -#if defined(__APPLE__) || (defined(_POSIX_TIMERS) && _POSIX_TIMERS > 0) +#if defined(__APPLE__) || defined (__gnu_hurd__) || (defined(_POSIX_TIMERS) && _POSIX_TIMERS > 0) # define _LIBCPP_HAS_CLOCK_GETTIME #endif diff --git a/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.gen.py b/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.gen.py index f72c257402936..a4e1c3c29c936 100644 --- a/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.gen.py +++ b/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.gen.py @@ -14,7 +14,7 @@ import sys sys.path.append(sys.argv[1]) -from libcxx.test.header_information import lit_header_restrictions, public_headers +from libcxx.header_information import lit_header_restrictions, public_headers for header in public_headers: # Skip C compatibility headers. 
diff --git a/libcxx/test/libcxx/clang_tidy.gen.py b/libcxx/test/libcxx/clang_tidy.gen.py index a7b8e7b3ec549..b2f1a171507d1 100644 --- a/libcxx/test/libcxx/clang_tidy.gen.py +++ b/libcxx/test/libcxx/clang_tidy.gen.py @@ -12,7 +12,7 @@ import sys sys.path.append(sys.argv[1]) -from libcxx.test.header_information import lit_header_restrictions, public_headers +from libcxx.header_information import lit_header_restrictions, public_headers for header in public_headers: BLOCKLIT = '' # block Lit from interpreting a RUN/XFAIL/etc inside the generation script diff --git a/libcxx/test/libcxx/double_include.gen.py b/libcxx/test/libcxx/double_include.gen.py index ad18121d53be0..85055dfc703de 100644 --- a/libcxx/test/libcxx/double_include.gen.py +++ b/libcxx/test/libcxx/double_include.gen.py @@ -12,7 +12,7 @@ import sys sys.path.append(sys.argv[1]) -from libcxx.test.header_information import lit_header_restrictions, public_headers +from libcxx.header_information import lit_header_restrictions, public_headers for header in public_headers: BLOCKLIT = '' # block Lit from interpreting a RUN/XFAIL/etc inside the generation script diff --git a/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp b/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp index cdae8e8834e65..d75951fdf890e 100644 --- a/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp +++ b/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp @@ -15,7 +15,7 @@ // UNSUPPORTED: clang-15, clang-16, clang-17 // TODO: Investigate this failure on GCC 12 (in Ubuntu Jammy) -// UNSUPPORTED: gcc-12 +// UNSUPPORTED: gcc-12, gcc-13 // RUN: %{cxx} %{flags} %s -o %t.exe %{compile_flags} -g %{link_flags} // Ensure locale-independence for unicode tests. 
diff --git a/libcxx/test/libcxx/header_inclusions.gen.py b/libcxx/test/libcxx/header_inclusions.gen.py index f41ac27b651b3..cdbc5b34b5152 100644 --- a/libcxx/test/libcxx/header_inclusions.gen.py +++ b/libcxx/test/libcxx/header_inclusions.gen.py @@ -13,7 +13,7 @@ import sys sys.path.append(sys.argv[1]) -from libcxx.test.header_information import lit_header_restrictions, public_headers, mandatory_inclusions +from libcxx.header_information import lit_header_restrictions, public_headers, mandatory_inclusions for header in public_headers: header_guard = lambda h: f"_LIBCPP_{h.upper().replace('.', '_').replace('/', '_')}" diff --git a/libcxx/test/libcxx/libcpp_version.gen.py b/libcxx/test/libcxx/libcpp_version.gen.py index 6a43d5dc3e4ae..47439b08fe51b 100644 --- a/libcxx/test/libcxx/libcpp_version.gen.py +++ b/libcxx/test/libcxx/libcpp_version.gen.py @@ -12,7 +12,7 @@ import sys sys.path.append(sys.argv[1]) -from libcxx.test.header_information import lit_header_restrictions, public_headers +from libcxx.header_information import lit_header_restrictions, public_headers for header in public_headers: print(f"""\ diff --git a/libcxx/test/libcxx/module_std.gen.py b/libcxx/test/libcxx/module_std.gen.py index 787317888d20d..db0678e221bef 100644 --- a/libcxx/test/libcxx/module_std.gen.py +++ b/libcxx/test/libcxx/module_std.gen.py @@ -21,7 +21,7 @@ import sys sys.path.append(sys.argv[1]) -from libcxx.test.header_information import toplevel_headers +from libcxx.header_information import toplevel_headers BLOCKLIT = ( "" # block Lit from interpreting a RUN/XFAIL/etc inside the generation script diff --git a/libcxx/test/libcxx/modules_include.gen.py b/libcxx/test/libcxx/modules_include.gen.py index b6bad1b8a104d..2e9fd73421ed2 100644 --- a/libcxx/test/libcxx/modules_include.gen.py +++ b/libcxx/test/libcxx/modules_include.gen.py @@ -14,10 +14,11 @@ import sys sys.path.append(sys.argv[1]) -from libcxx.test.header_information import lit_header_restrictions, public_headers +from 
libcxx.header_information import lit_header_restrictions, public_headers + +BLOCKLIT = '' # block Lit from interpreting a RUN/XFAIL/etc inside the generation script for header in public_headers: - BLOCKLIT = '' # block Lit from interpreting a RUN/XFAIL/etc inside the generation script print(f"""\ //--- {header}.compile.pass.cpp // RUN{BLOCKLIT}: %{{cxx}} %s %{{flags}} %{{compile_flags}} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only @@ -45,7 +46,7 @@ #include <{header}> """) -print(f""" +print(f"""\ //--- __std_clang_module.compile.pass.mm // RUN{BLOCKLIT}: %{{cxx}} %s %{{flags}} %{{compile_flags}} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only @@ -68,10 +69,6 @@ // TODO: Investigate this failure // UNSUPPORTED{BLOCKLIT}: LIBCXX-FREEBSD-FIXME -// Lit seems to compile this twice: once with the default flags and once with with -// the flags specified in the RUN directive. Guard the first compile from failing. -#if __has_feature(modules) @import std; -#endif """) diff --git a/libcxx/test/libcxx/nasty_macros.gen.py b/libcxx/test/libcxx/nasty_macros.gen.py index fdc308416f341..3c501a981d033 100644 --- a/libcxx/test/libcxx/nasty_macros.gen.py +++ b/libcxx/test/libcxx/nasty_macros.gen.py @@ -13,7 +13,7 @@ import sys sys.path.append(sys.argv[1]) -from libcxx.test.header_information import lit_header_restrictions, public_headers +from libcxx.header_information import lit_header_restrictions, public_headers for header in public_headers: print(f"""\ diff --git a/libcxx/test/libcxx/no_assert_include.gen.py b/libcxx/test/libcxx/no_assert_include.gen.py index 45152a35f3177..a5e733d2b48a1 100644 --- a/libcxx/test/libcxx/no_assert_include.gen.py +++ b/libcxx/test/libcxx/no_assert_include.gen.py @@ -13,7 +13,7 @@ import sys sys.path.append(sys.argv[1]) -from libcxx.test.header_information import lit_header_restrictions, public_headers +from libcxx.header_information import lit_header_restrictions, public_headers for header in public_headers: if header 
== 'cassert': diff --git a/libcxx/test/libcxx/numerics/c.math/constexpr-cxx23-clang.pass.cpp b/libcxx/test/libcxx/numerics/c.math/constexpr-cxx23-clang.pass.cpp index a07260a34516f..31511064ce7ca 100644 --- a/libcxx/test/libcxx/numerics/c.math/constexpr-cxx23-clang.pass.cpp +++ b/libcxx/test/libcxx/numerics/c.math/constexpr-cxx23-clang.pass.cpp @@ -58,9 +58,15 @@ int main(int, char**) { ASSERT_NOT_CONSTEXPR_CXX23(std::frexp(0.0f, &DummyInt) == 0.0f); ASSERT_NOT_CONSTEXPR_CXX23(std::frexp(0.0, &DummyInt) == 0.0); +//FIXME: currently linux powerpc does not support this expansion +// since 0.0L lowers to ppcf128 and special handling is required. +#if !defined(__LONG_DOUBLE_IBM128__) ASSERT_NOT_CONSTEXPR_CXX23(std::frexp(0.0L, &DummyInt) == 0.0L); +#endif ASSERT_NOT_CONSTEXPR_CXX23(std::frexpf(0.0f, &DummyInt) == 0.0f); +#if !defined(__LONG_DOUBLE_IBM128__) ASSERT_NOT_CONSTEXPR_CXX23(std::frexpl(0.0L, &DummyInt) == 0.0L); +#endif ASSERT_NOT_CONSTEXPR_CXX23(std::ilogb(1.0f) == 0); ASSERT_NOT_CONSTEXPR_CXX23(std::ilogb(1.0) == 0); diff --git a/libcxx/test/libcxx/transitive_includes.gen.py b/libcxx/test/libcxx/transitive_includes.gen.py index c446ceff7fef6..2ac5277878bee 100644 --- a/libcxx/test/libcxx/transitive_includes.gen.py +++ b/libcxx/test/libcxx/transitive_includes.gen.py @@ -20,7 +20,7 @@ import sys sys.path.append(sys.argv[1]) -from libcxx.test.header_information import lit_header_restrictions, public_headers +from libcxx.header_information import lit_header_restrictions, public_headers import re diff --git a/libcxx/test/std/algorithms/robust_against_adl.compile.pass.cpp b/libcxx/test/std/algorithms/robust_against_adl.compile.pass.cpp index 1411796d65963..77c88873073c9 100644 --- a/libcxx/test/std/algorithms/robust_against_adl.compile.pass.cpp +++ b/libcxx/test/std/algorithms/robust_against_adl.compile.pass.cpp @@ -11,7 +11,7 @@ // https://buildkite.com/llvm-project/libcxx-ci/builds/15823#0184fc0b-d56b-4774-9e1d-35fe24e09e37 // It seems like the CI gcc version 
is buggy. I can't reproduce the failure on my system or on // godbolt (https://godbolt.org/z/rsPv8e8fn). -// UNSUPPORTED: gcc-12 +// UNSUPPORTED: gcc-12, gcc-13 #include #include diff --git a/libcxx/test/std/ranges/range.utility/range.utility.conv/to.pass.cpp b/libcxx/test/std/ranges/range.utility/range.utility.conv/to.pass.cpp index 75f55bc420d0e..03270f25fd92b 100644 --- a/libcxx/test/std/ranges/range.utility/range.utility.conv/to.pass.cpp +++ b/libcxx/test/std/ranges/range.utility/range.utility.conv/to.pass.cpp @@ -19,6 +19,7 @@ #include #include "container.h" #include "test_iterators.h" +#include "test_macros.h" #include "test_range.h" template @@ -119,6 +120,7 @@ struct Fallback { constexpr void push_back(value_type) {} constexpr value_type* begin() { return &x; } constexpr value_type* end() { return &x; } + std::size_t size() const { return 0; } }; struct CtrDirectOrFallback : Fallback { @@ -180,7 +182,7 @@ struct Reservable : Fallback { reserve_called = true; } }; -static_assert(std::ranges::__reservable_container>); +LIBCPP_STATIC_ASSERT(std::ranges::__reservable_container>); constexpr void test_constraints() { { // Case 1 -- construct directly from the range. 
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp index 70ddab63f0c41..84e2c8ab1af0c 100644 --- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp +++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp @@ -7,7 +7,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // TODO FMT __builtin_memcpy isn't constexpr in GCC -// UNSUPPORTED: gcc-12 +// UNSUPPORTED: gcc-12, gcc-13 // diff --git a/libcxx/test/std/utilities/format/format.functions/format_tests.h b/libcxx/test/std/utilities/format/format.functions/format_tests.h index 7a9cdaab7e93e..0a5c6649240d6 100644 --- a/libcxx/test/std/utilities/format/format.functions/format_tests.h +++ b/libcxx/test/std/utilities/format/format.functions/format_tests.h @@ -3145,8 +3145,13 @@ void format_tests(TestFunction check, ExceptionTest check_exception) { // *** Test invalid format strings *** check_exception("The format string terminates at a '{'", SV("{")); + check_exception("The argument index value is too large for the number of arguments supplied", SV("{:")); check_exception("The replacement field misses a terminating '}'", SV("{:"), 42); + check_exception("The argument index should end with a ':' or a '}'", SV("{0")); + check_exception("The argument index value is too large for the number of arguments supplied", SV("{0:")); + check_exception("The replacement field misses a terminating '}'", SV("{0:"), 42); + check_exception("The format string contains an invalid escape sequence", SV("}")); check_exception("The format string contains an invalid escape sequence", SV("{:}-}"), 42); diff --git a/libcxx/test/std/utilities/format/format.functions/vformat.pass.cpp b/libcxx/test/std/utilities/format/format.functions/vformat.pass.cpp index 6943ddc2f968e..e16d50f18284f 
100644 --- a/libcxx/test/std/utilities/format/format.functions/vformat.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/vformat.pass.cpp @@ -50,6 +50,17 @@ auto test_exception = }; int main(int, char**) { +#if !defined(TEST_HAS_NO_EXCEPTIONS) + // reproducer of https://llvm.org/PR65011 + try { + const char fmt[] = {'{', '0'}; + char buf[4096]; + [[maybe_unused]] auto ignored = + std::vformat_to(buf, std::string_view{fmt, fmt + sizeof(fmt)}, std::make_format_args()); + } catch (...) { + } +#endif // !defined(TEST_HAS_NO_EXCEPTIONS) + format_tests(test, test_exception); #ifndef TEST_HAS_NO_WIDE_CHARACTERS diff --git a/libcxx/test/std/utilities/meta/meta.rel/is_convertible.pass.cpp b/libcxx/test/std/utilities/meta/meta.rel/is_convertible.pass.cpp index 804650fde3f3e..b96c9b11e2962 100644 --- a/libcxx/test/std/utilities/meta/meta.rel/is_convertible.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.rel/is_convertible.pass.cpp @@ -117,9 +117,12 @@ int main(int, char**) // Non-referencable function type static_assert((!std::is_convertible::value), ""); +// TODO(LLVM-19): Re-enable this once we switch to GCC 14. 
This is https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109680 +#ifndef TEST_COMPILER_GCC static_assert((!std::is_convertible::value), ""); static_assert((!std::is_convertible::value), ""); static_assert((!std::is_convertible::value), ""); +#endif static_assert((!std::is_convertible::value), ""); static_assert((!std::is_convertible::value), ""); static_assert((!std::is_convertible::value), ""); diff --git a/libcxx/test/std/utilities/meta/meta.rel/is_convertible_fallback.pass.cpp b/libcxx/test/std/utilities/meta/meta.rel/is_convertible_fallback.pass.cpp index 28495cfebd45c..6e420d63dbd59 100644 --- a/libcxx/test/std/utilities/meta/meta.rel/is_convertible_fallback.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.rel/is_convertible_fallback.pass.cpp @@ -10,6 +10,8 @@ // ADDITIONAL_COMPILE_FLAGS: -D _LIBCPP_USE_IS_CONVERTIBLE_FALLBACK +// UNSUPPORTED: gcc-13 + // type_traits // is_convertible diff --git a/libcxx/test/std/utilities/template.bitset/bitset.members/op_or_eq.pass.cpp b/libcxx/test/std/utilities/template.bitset/bitset.members/op_or_eq.pass.cpp index 0d0d62432e01d..8f3c0959c622f 100644 --- a/libcxx/test/std/utilities/template.bitset/bitset.members/op_or_eq.pass.cpp +++ b/libcxx/test/std/utilities/template.bitset/bitset.members/op_or_eq.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=15000000 + // bitset& operator|=(const bitset& rhs); // constexpr since C++23 #include diff --git a/libcxx/test/std/utilities/template.bitset/bitset.members/right_shift_eq.pass.cpp b/libcxx/test/std/utilities/template.bitset/bitset.members/right_shift_eq.pass.cpp index bf182523880d1..ef4b7fc60329d 100644 --- a/libcxx/test/std/utilities/template.bitset/bitset.members/right_shift_eq.pass.cpp +++ b/libcxx/test/std/utilities/template.bitset/bitset.members/right_shift_eq.pass.cpp @@ -6,6 +6,8 @@ // 
//===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=15000000 + // bitset& operator<<=(size_t pos); // constexpr since C++23 #include diff --git a/libcxx/utils/CMakeLists.txt b/libcxx/utils/CMakeLists.txt index 0338432f344a0..ce4e289290dce 100644 --- a/libcxx/utils/CMakeLists.txt +++ b/libcxx/utils/CMakeLists.txt @@ -2,6 +2,10 @@ add_custom_target(libcxx-generate-feature-test-macros COMMAND "${Python3_EXECUTABLE}" "${LIBCXX_SOURCE_DIR}/utils/generate_feature_test_macro_components.py" COMMENT "Generate the header and tests for feature test macros.") +add_custom_target(libcxx-generate-std-clang-module-header + COMMAND "${Python3_EXECUTABLE}" "${CMAKE_CURRENT_SOURCE_DIR}/generate_std_clang_module_header.py" + COMMENT "Generate the <__std_clang_module> header") + add_custom_target(libcxx-generate-extended-grapheme-cluster-tables COMMAND "${Python3_EXECUTABLE}" @@ -38,6 +42,7 @@ add_custom_target(libcxx-generate-iwyu-mapping add_custom_target(libcxx-generate-files DEPENDS libcxx-generate-feature-test-macros + libcxx-generate-std-clang-module-header libcxx-generate-extended-grapheme-cluster-tables libcxx-generate-extended-grapheme-cluster-tests libcxx-generate-escaped-output-table diff --git a/libcxx/utils/generate_std_clang_module_header.py b/libcxx/utils/generate_std_clang_module_header.py new file mode 100644 index 0000000000000..afdc9f653c2a2 --- /dev/null +++ b/libcxx/utils/generate_std_clang_module_header.py @@ -0,0 +1,64 @@ +# ===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ===----------------------------------------------------------------------===## + +import operator +import os.path + +import libcxx.header_information + +header_restrictions = libcxx.header_information.header_restrictions + +libcxx_include_directory = os.path.join( + os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "include" +) +with open( + os.path.join(libcxx_include_directory, "__std_clang_module"), "w" +) as std_clang_module_header: + std_clang_module_header.write( + """\ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// WARNING, this entire header is generated by +// utils/generate_std_clang_module_header.py +// DO NOT MODIFY! + +// This header should not be directly included, it's exclusively to import all +// of the libc++ public clang modules for the `std` clang module to export. In +// other words, it's to facilitate `@import std;` in Objective-C++ and `import std` +// in Swift to expose all of the libc++ interfaces. This is generally not +// recommended, however there are some clients that need to import all of libc++ +// without knowing what "all" is. +#if !__building_module(std) +# error "Do not include this header directly, include individual headers instead" +#endif + +#include <__config> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +""" + ) + # Include the angle brackets in sorting so that sorts before + # like check-format wants. 
+ for include, header in sorted([(f"<{header}>", header) for header in libcxx.header_information.public_headers]): + header_restriction = header_restrictions.get(header) + if header_restriction: + std_clang_module_header.write(f"#if {header_restriction}\n") + std_clang_module_header.write(f"# include {include}\n") + std_clang_module_header.write(f"#endif\n") + else: + std_clang_module_header.write(f"#include {include}\n") diff --git a/libcxx/utils/libcxx/test/header_information.py b/libcxx/utils/libcxx/header_information.py similarity index 78% rename from libcxx/utils/libcxx/test/header_information.py rename to libcxx/utils/libcxx/header_information.py index 9ca0e9548c724..169638d5efc11 100644 --- a/libcxx/utils/libcxx/test/header_information.py +++ b/libcxx/utils/libcxx/header_information.py @@ -8,6 +8,46 @@ import os, pathlib +header_restrictions = { + # headers with #error directives + "atomic": "!defined(_LIBCPP_HAS_NO_ATOMIC_HEADER)", + "stdatomic.h": "!defined(_LIBCPP_HAS_NO_ATOMIC_HEADER)", + + # headers with #error directives + "ios": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", + "locale.h": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", + # transitive includers of the above headers + "clocale": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", + "codecvt": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", + "experimental/regex": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", + "fstream": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", + "iomanip": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", + "iostream": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", + "istream": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", + "locale": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", + "ostream": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", + "regex": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", + "sstream": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", + "streambuf": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", + "strstream": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", + + # headers with #error directives + "barrier": 
"!defined(_LIBCPP_HAS_NO_THREADS)", + "future": "!defined(_LIBCPP_HAS_NO_THREADS)", + "latch": "!defined(_LIBCPP_HAS_NO_THREADS)", + "semaphore": "!defined(_LIBCPP_HAS_NO_THREADS)", + "shared_mutex": "!defined(_LIBCPP_HAS_NO_THREADS)", + "stop_token": "!defined(_LIBCPP_HAS_NO_THREADS)", + "thread": "!defined(_LIBCPP_HAS_NO_THREADS)", + + # headers with #error directives + "wchar.h": "!defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS)", + "wctype.h": "!defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS)", + # transitive includers of the above headers + "cwchar": "!defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS)", + "cwctype": "!defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS)", +} + lit_header_restrictions = { "barrier": "// UNSUPPORTED: no-threads, c++03, c++11, c++14, c++17", "clocale": "// UNSUPPORTED: no-localization", @@ -136,7 +176,7 @@ def is_header(file): and file.name != "libcxx.imp" ) -libcxx_root = pathlib.Path(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))) +libcxx_root = pathlib.Path(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) include = pathlib.Path(os.path.join(libcxx_root, "include")) test = pathlib.Path(os.path.join(libcxx_root, "test")) assert libcxx_root.exists() diff --git a/libcxx/utils/libcxx/test/params.py b/libcxx/utils/libcxx/test/params.py index f43c634a1644f..f1401d7afc635 100644 --- a/libcxx/utils/libcxx/test/params.py +++ b/libcxx/utils/libcxx/test/params.py @@ -55,6 +55,14 @@ # Don't fail compilation in case the compiler fails to perform the requested # loop vectorization. "-Wno-pass-failed", + + # TODO: Find out why GCC warns in lots of places (is this a problem with always_inline?) 
+ "-Wno-dangling-reference", + "-Wno-mismatched-new-delete", + "-Wno-redundant-move", + + # This doesn't make sense in real code, but we have to test it because the standard requires us to not break + "-Wno-self-move", ] _allStandards = ["c++03", "c++11", "c++14", "c++17", "c++20", "c++23", "c++26"] diff --git a/libcxxabi/test/catch_member_function_pointer_02.pass.cpp b/libcxxabi/test/catch_member_function_pointer_02.pass.cpp index 3236f9aae1de1..667447db1e68a 100644 --- a/libcxxabi/test/catch_member_function_pointer_02.pass.cpp +++ b/libcxxabi/test/catch_member_function_pointer_02.pass.cpp @@ -15,7 +15,7 @@ // GCC supports noexcept function types but this test still fails. // This is likely a bug in their implementation. Investigation needed. -// XFAIL: gcc-11, gcc-12 +// XFAIL: gcc-11, gcc-12, gcc-13 #include diff --git a/lld/ELF/Arch/PPC.cpp b/lld/ELF/Arch/PPC.cpp index 87942c1e92452..3d21edb3453a1 100644 --- a/lld/ELF/Arch/PPC.cpp +++ b/lld/ELF/Arch/PPC.cpp @@ -471,10 +471,14 @@ void PPC::relaxTlsIeToLe(uint8_t *loc, const Relocation &rel, if (insn >> 26 != 31) error("unrecognized instruction for IE to LE R_PPC_TLS"); // addi rT, rT, x@tls --> addi rT, rT, x@tprel@l - uint32_t dFormOp = getPPCDFormOp((read32(loc) & 0x000007fe) >> 1); - if (dFormOp == 0) - error("unrecognized instruction for IE to LE R_PPC_TLS"); - write32(loc, (dFormOp << 26) | (insn & 0x03ff0000) | lo(val)); + unsigned secondaryOp = (read32(loc) & 0x000007fe) >> 1; + uint32_t dFormOp = getPPCDFormOp(secondaryOp); + if (dFormOp == 0) { // Expecting a DS-Form instruction. 
+ dFormOp = getPPCDSFormOp(secondaryOp); + if (dFormOp == 0) + error("unrecognized instruction for IE to LE R_PPC_TLS"); + } + write32(loc, (dFormOp | (insn & 0x03ff0000) | lo(val))); break; } default: diff --git a/lld/ELF/Arch/PPC64.cpp b/lld/ELF/Arch/PPC64.cpp index 36b1d0e3c9be4..0b6459f852c0b 100644 --- a/lld/ELF/Arch/PPC64.cpp +++ b/lld/ELF/Arch/PPC64.cpp @@ -37,6 +37,12 @@ enum XFormOpcd { STHX = 407, STWX = 151, STDX = 149, + LHAX = 343, + LWAX = 341, + LFSX = 535, + LFDX = 599, + STFSX = 663, + STFDX = 727, ADD = 266, }; @@ -49,7 +55,6 @@ enum DFormOpcd { LWZ = 32, LWZU = 33, LFSU = 49, - LD = 58, LFDU = 51, STB = 38, STBU = 39, @@ -59,10 +64,20 @@ enum DFormOpcd { STWU = 37, STFSU = 53, STFDU = 55, - STD = 62, + LHA = 42, + LFS = 48, + LFD = 50, + STFS = 52, + STFD = 54, ADDI = 14 }; +enum DSFormOpcd { + LD = 58, + LWA = 58, + STD = 62 +}; + constexpr uint32_t NOP = 0x60000000; enum class PPCLegacyInsn : uint32_t { @@ -825,26 +840,48 @@ void PPC64::relaxTlsLdToLe(uint8_t *loc, const Relocation &rel, } } +// Map X-Form instructions to their DS-Form counterparts, if applicable. +// The full encoding is returned here to distinguish between the different +// DS-Form instructions. 
+unsigned elf::getPPCDSFormOp(unsigned secondaryOp) { + switch (secondaryOp) { + case LWAX: + return (LWA << 26) | 0x2; + case LDX: + return LD << 26; + case STDX: + return STD << 26; + default: + return 0; + } +} + unsigned elf::getPPCDFormOp(unsigned secondaryOp) { switch (secondaryOp) { case LBZX: - return LBZ; + return LBZ << 26; case LHZX: - return LHZ; + return LHZ << 26; case LWZX: - return LWZ; - case LDX: - return LD; + return LWZ << 26; case STBX: - return STB; + return STB << 26; case STHX: - return STH; + return STH << 26; case STWX: - return STW; - case STDX: - return STD; + return STW << 26; + case LHAX: + return LHA << 26; + case LFSX: + return LFS << 26; + case LFDX: + return LFD << 26; + case STFSX: + return STFS << 26; + case STFDX: + return STFD << 26; case ADD: - return ADDI; + return ADDI << 26; default: return 0; } @@ -898,10 +935,16 @@ void PPC64::relaxTlsIeToLe(uint8_t *loc, const Relocation &rel, error("unrecognized instruction for IE to LE R_PPC64_TLS"); uint32_t secondaryOp = (read32(loc) & 0x000007FE) >> 1; // bits 21-30 uint32_t dFormOp = getPPCDFormOp(secondaryOp); - if (dFormOp == 0) - error("unrecognized instruction for IE to LE R_PPC64_TLS"); - write32(loc, ((dFormOp << 26) | (read32(loc) & 0x03FFFFFF))); - relocateNoSym(loc + offset, R_PPC64_TPREL16_LO, val); + uint32_t finalReloc; + if (dFormOp == 0) { // Expecting a DS-Form instruction. + dFormOp = getPPCDSFormOp(secondaryOp); + if (dFormOp == 0) + error("unrecognized instruction for IE to LE R_PPC64_TLS"); + finalReloc = R_PPC64_TPREL16_LO_DS; + } else + finalReloc = R_PPC64_TPREL16_LO; + write32(loc, dFormOp | (read32(loc) & 0x03ff0000)); + relocateNoSym(loc + offset, finalReloc, val); } else if (locAsInt % 4 == 1) { // If the offset is not 4 byte aligned then we have a PCRel type reloc. 
// This version of the relocation is offset by one byte from the @@ -926,9 +969,12 @@ void PPC64::relaxTlsIeToLe(uint8_t *loc, const Relocation &rel, } } else { uint32_t dFormOp = getPPCDFormOp(secondaryOp); - if (dFormOp == 0) - errorOrWarn("unrecognized instruction for IE to LE R_PPC64_TLS"); - write32(loc - 1, ((dFormOp << 26) | (tlsInstr & 0x03FF0000))); + if (dFormOp == 0) { // Expecting a DS-Form instruction. + dFormOp = getPPCDSFormOp(secondaryOp); + if (dFormOp == 0) + errorOrWarn("unrecognized instruction for IE to LE R_PPC64_TLS"); + } + write32(loc - 1, (dFormOp | (tlsInstr & 0x03ff0000))); } } else { errorOrWarn("R_PPC64_TLS must be either 4 byte aligned or one byte " diff --git a/lld/ELF/Target.h b/lld/ELF/Target.h index 9d4f22dd93f1b..47dbe6b4d1c65 100644 --- a/lld/ELF/Target.h +++ b/lld/ELF/Target.h @@ -207,6 +207,7 @@ void processArmCmseSymbols(); void writePPC32GlinkSection(uint8_t *buf, size_t numEntries); unsigned getPPCDFormOp(unsigned secondaryOp); +unsigned getPPCDSFormOp(unsigned secondaryOp); // In the PowerPC64 Elf V2 abi a function can have 2 entry points. The first // is a global entry point (GEP) which typically is used to initialize the TOC diff --git a/lld/test/ELF/ppc32-tls-ie.s b/lld/test/ELF/ppc32-tls-ie.s index f9f46452484a6..84a105c8626b3 100644 --- a/lld/test/ELF/ppc32-tls-ie.s +++ b/lld/test/ELF/ppc32-tls-ie.s @@ -12,8 +12,8 @@ # IE-REL: FLAGS STATIC_TLS ## A non-preemptable symbol (b) has 0 st_shndx. 
# IE-REL: .rela.dyn { -# IE-REL-NEXT: 0x20238 R_PPC_TPREL32 - 0xC -# IE-REL-NEXT: 0x20234 R_PPC_TPREL32 a 0x0 +# IE-REL-NEXT: 0x20258 R_PPC_TPREL32 - 0xC +# IE-REL-NEXT: 0x20254 R_PPC_TPREL32 a 0x0 # IE-REL-NEXT: } ## &.got[3] - _GLOBAL_OFFSET_TABLE_ = 12 @@ -44,6 +44,12 @@ lbzx 10, 8, c@tls # IE-NEXT: stbx 14, 4, 2 # IE-NEXT: sthx 15, 5, 2 # IE-NEXT: stwx 16, 6, 2 +# IE-NEXT: lhax 17, 7, 2 +# IE-NEXT: lwax 18, 8, 2 +# IE-NEXT: lfsx 19, 9, 2 +# IE-NEXT: lfdx 20, 10, 2 +# IE-NEXT: stfsx 21, 11, 2 +# IE-NEXT: stfdx 22, 12, 2 ## In LE, these X-Form instructions are changed to their corresponding D-Form. # LE-NEXT: lhz 12, -28660(2) @@ -51,12 +57,26 @@ lbzx 10, 8, c@tls # LE-NEXT: stb 14, -28660(4) # LE-NEXT: sth 15, -28660(5) # LE-NEXT: stw 16, -28660(6) +# LE-NEXT: lha 17, -28660(7) +# LE-NEXT: lwa 18, -28660(8) +# LE-NEXT: lfs 19, -28660(9) +# LE-NEXT: lfd 20, -28660(10) +# LE-NEXT: stfs 21, -28660(11) +# LE-NEXT: stfd 22, -28660(12) lhzx 12, 2, s@tls lwzx 13, 3, i@tls stbx 14, 4, c@tls sthx 15, 5, s@tls stwx 16, 6, i@tls +lhax 17, 7, s@tls +lwax 18, 8, i@tls +lfsx 19, 9, f@tls +lfdx 20, 10, d@tls +stfsx 21, 11, f@tls +stfdx 22, 12, d@tls +ldx 23, 13, l@tls +stdx 24, 14, l@tls .section .tbss .globl a @@ -66,3 +86,6 @@ a: c: s: i: +f: +d: +l: diff --git a/lld/test/ELF/ppc64-tls-ie.s b/lld/test/ELF/ppc64-tls-ie.s index 8da808b86c30b..8855e8c012399 100644 --- a/lld/test/ELF/ppc64-tls-ie.s +++ b/lld/test/ELF/ppc64-tls-ie.s @@ -24,10 +24,12 @@ # IE-REL: FLAGS STATIC_TLS # IE-REL: .rela.dyn { -# IE-REL-NEXT: 0x204C8 R_PPC64_TPREL64 c 0x0 -# IE-REL-NEXT: 0x204D0 R_PPC64_TPREL64 s 0x0 -# IE-REL-NEXT: 0x204D8 R_PPC64_TPREL64 i 0x0 -# IE-REL-NEXT: 0x204E0 R_PPC64_TPREL64 l 0x0 +# IE-REL-NEXT: 0x205A8 R_PPC64_TPREL64 c 0x0 +# IE-REL-NEXT: 0x205B0 R_PPC64_TPREL64 s 0x0 +# IE-REL-NEXT: 0x205B8 R_PPC64_TPREL64 i 0x0 +# IE-REL-NEXT: 0x205C0 R_PPC64_TPREL64 l 0x0 +# IE-REL-NEXT: 0x205C8 R_PPC64_TPREL64 f 0x0 +# IE-REL-NEXT: 0x205D0 R_PPC64_TPREL64 d 0x0 # IE-REL-NEXT: } # 
INPUT-REL: R_PPC64_GOT_TPREL16_HA c 0x0 @@ -152,10 +154,64 @@ test_ds: ld 4, l@got@tprel(2) stdx 3, 4, l@tls +# LE-LABEL: : +# LE-NEXT: nop +# LE-NEXT: addis 3, 13, 0 +# LE-NEXT: lha 3, -28670(3) +test_lhax: + addis 3, 2, s@got@tprel@ha + ld 3, s@got@tprel@l(3) + lhax 3, 3, s@tls + +# LE-LABEL: : +# LE-NEXT: nop +# LE-NEXT: addis 3, 13, 0 +# LE-NEXT: lwa 3, -28668(3) +test_lwax: + addis 3, 2, i@got@tprel@ha + ld 3, i@got@tprel@l(3) + lwax 3, 3, i@tls + +# LE-LABEL: : +# LE-NEXT: nop +# LE-NEXT: addis 3, 13, 0 +# LE-NEXT: lfs 3, -28656(3) +test_lfsx: + addis 3, 2, f@got@tprel@ha + ld 3, f@got@tprel@l(3) + lfsx 3, 3, f@tls + +# LE-LABEL: : +# LE-NEXT: nop +# LE-NEXT: addis 3, 13, 0 +# LE-NEXT: lfd 3, -28648(3) +test_lfdx: + addis 3, 2, d@got@tprel@ha + ld 3, d@got@tprel@l(3) + lfdx 3, 3, d@tls + +# LE-LABEL: : +# LE-NEXT: nop +# LE-NEXT: addis 4, 13, 0 +# LE-NEXT: stfs 3, -28656(4) +test_stfsx: + addis 4, 2, f@got@tprel@ha + ld 4, f@got@tprel@l(4) + stfsx 3, 4, f@tls + +# LE-LABEL: : +# LE-NEXT: nop +# LE-NEXT: addis 4, 13, 0 +# LE-NEXT: stfd 3, -28648(4) +test_stfdx: + addis 4, 2, d@got@tprel@ha + ld 4, d@got@tprel@l(4) + stfdx 3, 4, d@tls + # NOREL: There are no relocations in this file. 
.section .tdata,"awT",@progbits -.globl c, s, i, l +.globl c, s, i, l, f, d c: .byte 97 @@ -170,3 +226,9 @@ i: .p2align 3 l: .quad 55 +f: +.long 55 + +.p2align 3 +d: +.quad 55 diff --git a/lld/test/ELF/ppc64-tls-pcrel-ie.s b/lld/test/ELF/ppc64-tls-pcrel-ie.s index f7a828dc41744..38c081f966469 100644 --- a/lld/test/ELF/ppc64-tls-pcrel-ie.s +++ b/lld/test/ELF/ppc64-tls-pcrel-ie.s @@ -29,6 +29,12 @@ SECTIONS { .text_val 0x1002000 : { *(.text_val) } .text_twoval 0x1003000 : { *(.text_twoval) } .text_incrval 0x1004000 : { *(.text_incrval) } + .text_incrval_half 0x1005000 : { *(.text_incrval_half) } + .text_incrval_word 0x1006000 : { *(.text_incrval_word) } + .text_incrval_float 0x1007000 : { *(.text_incrval_float) } + .text_incrval_double 0x1008000 : { *(.text_incrval_double) } + .text_incrval_dword 0x1009000 : { *(.text_incrval_dword) } + .text_incrval_half_zero 0x1010000 : { *(.text_incrval_half_zero) } } #--- defs @@ -42,26 +48,26 @@ y: #--- asm # IE-RELOC: Relocation section '.rela.dyn' at offset 0x10090 contains 2 entries: -# IE-RELOC: 00000000010040f0 0000000100000049 R_PPC64_TPREL64 0000000000000000 x + 0 -# IE-RELOC: 00000000010040f8 0000000200000049 R_PPC64_TPREL64 0000000000000000 y + 0 +# IE-RELOC: 00000000010100f0 0000000100000049 R_PPC64_TPREL64 0000000000000000 x + 0 +# IE-RELOC-NEXT: 00000000010100f8 0000000200000049 R_PPC64_TPREL64 0000000000000000 y + 0 # IE-SYM: Symbol table '.dynsym' contains 3 entries: # IE-SYM: 1: 0000000000000000 0 TLS GLOBAL DEFAULT UND x # IE-SYM: 2: 0000000000000000 0 TLS GLOBAL DEFAULT UND y # IE-GOT: Hex dump of section '.got': -# IE-GOT-NEXT: 0x010040e8 e8c00001 00000000 00000000 00000000 +# IE-GOT-NEXT: 0x010100e8 e8800101 00000000 00000000 00000000 # LE-RELOC: There are no relocations in this file. 
-# LE-SYM: Symbol table '.symtab' contains 8 entries: -# LE-SYM: 6: 0000000000000000 0 TLS GLOBAL DEFAULT 6 x -# LE-SYM: 7: 0000000000000004 0 TLS GLOBAL DEFAULT 6 y +# LE-SYM: Symbol table '.symtab' contains 14 entries: +# LE-SYM: 0000000000000000 0 TLS GLOBAL DEFAULT [[#]] x +# LE-SYM: 0000000000000004 0 TLS GLOBAL DEFAULT [[#]] y # LE-GOT: could not find section '.got' # IE-LABEL: : -# IE-NEXT: pld 3, 12528(0), 1 +# IE-NEXT: pld 3, 61680(0), 1 # IE-NEXT: add 3, 3, 13 # IE-NEXT: blr # LE-LABEL: : @@ -75,7 +81,7 @@ IEAddr: blr # IE-LABEL: : -# IE-NEXT: pld 3, 12512(0), 1 +# IE-NEXT: pld 3, 61664(0), 1 # IE-NEXT: add 4, 3, 13 # IE-NEXT: blr # LE-LABEL: : @@ -89,7 +95,7 @@ IEAddrCopy: blr # IE-LABEL: : -# IE-NEXT: pld 3, 8432(0), 1 +# IE-NEXT: pld 3, 57584(0), 1 # IE-NEXT: lwzx 3, 3, 13 # IE-NEXT: blr # LE-LABEL: : @@ -103,8 +109,8 @@ IEVal: blr # IE-LABEL: : -# IE-NEXT: pld 3, 4336(0), 1 -# IE-NEXT: pld 4, 4336(0), 1 +# IE-NEXT: pld 3, 53488(0), 1 +# IE-NEXT: pld 4, 53488(0), 1 # IE-NEXT: lwzx 3, 3, 13 # IE-NEXT: lwzx 4, 4, 13 # IE-NEXT: blr @@ -123,7 +129,7 @@ IETwoVal: blr # IE-LABEL: : -# IE-NEXT: pld 4, 248(0), 1 +# IE-NEXT: pld 4, 49400(0), 1 # IE-NEXT: lwzx 3, 4, 13 # IE-NEXT: stwx 3, 4, 13 # IE-NEXT: blr @@ -138,3 +144,105 @@ IEIncrementVal: lwzx 3, 4, y@tls@pcrel stwx 3, 4, y@tls@pcrel blr + +# IE-LABEL: : +# IE-NEXT: pld 4, 45304(0), 1 +# IE-NEXT: lhax 3, 4, 13 +# IE-NEXT: sthx 3, 4, 13 +# IE-NEXT: blr +# LE-LABEL: : +# LE-NEXT: paddi 4, 13, -28668, 0 +# LE-NEXT: lha 3, 0(4) +# LE-NEXT: sth 3, 0(4) +# LE-NEXT: blr +.section .text_incrval_half, "ax", %progbits +IEIncrementValHalf: + pld 4, y@got@tprel@pcrel(0), 1 + lhax 3, 4, y@tls@pcrel + sthx 3, 4, y@tls@pcrel + blr + +# IE-LABEL: : +# IE-NEXT: pld 4, 41208(0), 1 +# IE-NEXT: lwax 3, 4, 13 +# IE-NEXT: stwx 3, 4, 13 +# IE-NEXT: blr +# LE-LABEL: : +# LE-NEXT: paddi 4, 13, -28668, 0 +# LE-NEXT: lwa 3, 0(4) +# LE-NEXT: stw 3, 0(4) +# LE-NEXT: blr +.section .text_incrval_word, "ax", %progbits 
+IEIncrementValWord: + pld 4, y@got@tprel@pcrel(0), 1 + lwax 3, 4, y@tls@pcrel + stwx 3, 4, y@tls@pcrel + blr + +# IE-LABEL: : +# IE-NEXT: pld 4, 37112(0), 1 +# IE-NEXT: lfsx 3, 4, 13 +# IE-NEXT: stfsx 3, 4, 13 +# IE-NEXT: blr +# LE-LABEL: : +# LE-NEXT: paddi 4, 13, -28668, 0 +# LE-NEXT: lfs 3, 0(4) +# LE-NEXT: stfs 3, 0(4) +# LE-NEXT: blr +.section .text_incrval_float, "ax", %progbits +IEIncrementValFloat: + pld 4, y@got@tprel@pcrel(0), 1 + lfsx 3, 4, y@tls@pcrel + stfsx 3, 4, y@tls@pcrel + blr + +# IE-LABEL: : +# IE-NEXT: pld 4, 33016(0), 1 +# IE-NEXT: lfdx 3, 4, 13 +# IE-NEXT: stfdx 3, 4, 13 +# IE-NEXT: blr +# LE-LABEL: : +# LE-NEXT: paddi 4, 13, -28668, 0 +# LE-NEXT: lfd 3, 0(4) +# LE-NEXT: stfd 3, 0(4) +# LE-NEXT: blr +.section .text_incrval_double, "ax", %progbits +IEIncrementValDouble: + pld 4, y@got@tprel@pcrel(0), 1 + lfdx 3, 4, y@tls@pcrel + stfdx 3, 4, y@tls@pcrel + blr + +# IE-LABEL: : +# IE-NEXT: pld 4, 28920(0), 1 +# IE-NEXT: ldx 3, 4, 13 +# IE-NEXT: stdx 3, 4, 13 +# IE-NEXT: blr +# LE-LABEL: : +# LE-NEXT: paddi 4, 13, -28668, 0 +# LE-NEXT: ld 3, 0(4) +# LE-NEXT: std 3, 0(4) +# LE-NEXT: blr +.section .text_incrval_dword, "ax", %progbits +IEIncrementValDword: + pld 4, y@got@tprel@pcrel(0), 1 + ldx 3, 4, y@tls@pcrel + stdx 3, 4, y@tls@pcrel + blr + +# IE-LABEL: : +# IE-NEXT: pld 4, 248(0), 1 +# IE-NEXT: lhzx 3, 4, 13 +# IE-NEXT: sthx 3, 4, 13 +# IE-NEXT: blr +# LE-LABEL: : +# LE-NEXT: paddi 4, 13, -28668, 0 +# LE-NEXT: lhz 3, 0(4) +# LE-NEXT: sth 3, 0(4) +# LE-NEXT: blr +.section .text_incrval_half_zero, "ax", %progbits +IEIncrementValHalfZero: + pld 4, y@got@tprel@pcrel(0), 1 + lhzx 3, 4, y@tls@pcrel + sthx 3, 4, y@tls@pcrel + blr diff --git a/lldb/source/Host/windows/FileSystem.cpp b/lldb/source/Host/windows/FileSystem.cpp index b919d9bcd9dd4..4b0cd74b8013b 100644 --- a/lldb/source/Host/windows/FileSystem.cpp +++ b/lldb/source/Host/windows/FileSystem.cpp @@ -101,6 +101,8 @@ int FileSystem::Open(const char *path, int flags, int mode) { std::wstring 
wpath; if (!llvm::ConvertUTF8toWide(path, wpath)) return -1; + // All other bits are rejected by _wsopen_s + mode = mode & (_S_IREAD | _S_IWRITE); int result; ::_wsopen_s(&result, wpath.c_str(), flags, _SH_DENYNO, mode); return result; diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index bd12d8c3964f6..0cb7a6266f1ab 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -230,19 +230,57 @@ Changes to the MIPS Backend Changes to the PowerPC Backend ------------------------------ +* Improved code sequence of materializing 64-bit immediate numbers, expanding + ``is.fpclass`` intrinsic and forwarding stores. +* Implemented DFP instructions (for use via inline asm). +* Improved code gen for vector add. +* Added ability to show statistics of number of entries in the TOC. +* Added Binary Coded Decimal Assist instructions (for use via inline asm). +* Added basic support for vector functions in GlobalISel. +* Added additional X-Form load and store instruction generation for TLS accesses. +* PPC64LE backend is added to JITLink. +* Added various bug fixes and optimizations. +* Added function pointer alignment to the DataLayout for Power, which lets us + make more informed choices about what this alignment defaults to for various + purposes (e.g., C++ pointers to member). If the target ABI uses function + descriptor objects, this is the alignment we would emit the descriptor with. + Otherwise, a function pointer points to a global entry point, so this is at + least the alignment for code on Power (i.e., 4-bytes). + +AIX Support/improvements: + + * A new option ``-mxcoff-roptr`` is added to ``clang`` and ``llc``. When this option is present, constant objects with relocatable address values are put - into the RO data section. This option should be used with the ``-fdata-sections`` - option, and is not supported with ``-fno-data-sections``. The option is - only supported on AIX. 
-* On AIX, teach the profile runtime to check for a build-id string; such string - can be created by the -mxcoff-build-id option. + into the RO data section. This option should be used with the + ``-fdata-sections`` option, and is not supported with ``-fno-data-sections``. + +* Taught the profile runtime to check for a build-id string. Build-id strings + can be created via the ``-mxcoff-build-id`` option. + * Removed ``-ppc-quadword-atomics`` which only affected lock-free quadword atomics on AIX. Now backend generates lock-free quadword atomics code on AIX by default. To support lock-free quadword atomics in libatomic, the OS level must be at least AIX 7.2 TL5 SP3 with libc++.rte of version 17.1.1 or above installed. +* Integrated assembler is enabled by default on AIX. +* System assembler is always used to compile assembly files on AIX. +* Added support for local-exec TLS. +* Added a new option, ``--traceback-table``, to ``llvm-objdump`` to print out + the traceback table information for XCOFF object files. +* Added ``llvm-ar`` object mode options ``-X32``, ``-X64``, ``-X32-64``, + and ``-Xany``. +* Changed the default name of the text-section csect to be an empty string + instead of ``.text``. This change does not affect the behaviour + of the program. +* Fixed a problem when the personality routine for the legacy AIX ``xlclang++`` + compiler uses the stack slot to pass the exception object to the landing pad. + Runtime routine ``__xlc_exception_handle()`` invoked by the landing pad to + retrieve the exception object now skips frames not associated with functions + that are C++ EH-aware because the compiler sometimes generates a wrapper of + ``__xlc_exception_handle()`` for optimization purposes. 
+ Changes to the RISC-V Backend ----------------------------- diff --git a/llvm/include/llvm/Analysis/LazyValueInfo.h b/llvm/include/llvm/Analysis/LazyValueInfo.h index b109b7f7e65ae..7b2bfdac75a8f 100644 --- a/llvm/include/llvm/Analysis/LazyValueInfo.h +++ b/llvm/include/llvm/Analysis/LazyValueInfo.h @@ -115,6 +115,9 @@ class LazyValueInfo { /// PredBB to OldSucc to be from PredBB to NewSucc instead. void threadEdge(BasicBlock *PredBB, BasicBlock *OldSucc, BasicBlock *NewSucc); + /// Remove information related to this value from the cache. + void forgetValue(Value *V); + /// Inform the analysis cache that we have erased a block. void eraseBlock(BasicBlock *BB); diff --git a/llvm/include/llvm/Option/ArgList.h b/llvm/include/llvm/Option/ArgList.h index 310c8900af9ef..c57bd2350af14 100644 --- a/llvm/include/llvm/Option/ArgList.h +++ b/llvm/include/llvm/Option/ArgList.h @@ -299,6 +299,7 @@ class ArgList { /// \p Default if neither option is given. If both the option and its /// negation are present, the last one wins. bool hasFlag(OptSpecifier Pos, OptSpecifier Neg, bool Default) const; + bool hasFlagNoClaim(OptSpecifier Pos, OptSpecifier Neg, bool Default) const; /// hasFlag - Given an option \p Pos, an alias \p PosAlias and its negative /// form \p Neg, return true if the option or its alias is present, false if diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index 33651783cb177..2ba6036056d99 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -465,6 +465,10 @@ class LazyValueInfoImpl { F.print(OS, &Writer); } + /// This is part of the update interface to remove information related to this + /// value from the cache. + void forgetValue(Value *V) { TheCache.eraseValue(V); } + /// This is part of the update interface to inform the cache /// that a block has been deleted. 
void eraseBlock(BasicBlock *BB) { @@ -1969,6 +1973,11 @@ void LazyValueInfo::threadEdge(BasicBlock *PredBB, BasicBlock *OldSucc, } } +void LazyValueInfo::forgetValue(Value *V) { + if (PImpl) + getImpl(PImpl, AC, nullptr).forgetValue(V); +} + void LazyValueInfo::eraseBlock(BasicBlock *BB) { if (PImpl) { getImpl(PImpl, AC, BB->getModule()).eraseBlock(BB); diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp index 4e80e9b58c060..523e077fd9a28 100644 --- a/llvm/lib/CodeGen/MachineLICM.cpp +++ b/llvm/lib/CodeGen/MachineLICM.cpp @@ -538,6 +538,10 @@ void MachineLICMBase::HoistRegionPostRA() { PhysRegDefs.set(*AI); } + // Funclet entry blocks will clobber all registers + if (const uint32_t *Mask = BB->getBeginClobberMask(TRI)) + PhysRegClobbers.setBitsNotInMask(Mask); + SpeculationState = SpeculateUnknown; for (MachineInstr &MI : *BB) ProcessMI(&MI, PhysRegDefs, PhysRegClobbers, StoredFIs, Candidates); diff --git a/llvm/lib/Option/ArgList.cpp b/llvm/lib/Option/ArgList.cpp index 400bedabc0037..86f28e578e5d9 100644 --- a/llvm/lib/Option/ArgList.cpp +++ b/llvm/lib/Option/ArgList.cpp @@ -75,6 +75,13 @@ bool ArgList::hasFlag(OptSpecifier Pos, OptSpecifier Neg, bool Default) const { return Default; } +bool ArgList::hasFlagNoClaim(OptSpecifier Pos, OptSpecifier Neg, + bool Default) const { + if (Arg *A = getLastArgNoClaim(Pos, Neg)) + return A->getOption().matches(Pos); + return Default; +} + bool ArgList::hasFlag(OptSpecifier Pos, OptSpecifier PosAlias, OptSpecifier Neg, bool Default) const { if (Arg *A = getLastArg(Pos, PosAlias, Neg)) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 0605dfa637939..c7a6dd7deb45b 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -13840,7 +13840,17 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::aarch64_neon_ld4: case 
Intrinsic::aarch64_neon_ld1x2: case Intrinsic::aarch64_neon_ld1x3: - case Intrinsic::aarch64_neon_ld1x4: + case Intrinsic::aarch64_neon_ld1x4: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + Info.ptrVal = I.getArgOperand(I.arg_size() - 1); + Info.offset = 0; + Info.align.reset(); + // volatile loads with NEON intrinsics not supported + Info.flags = MachineMemOperand::MOLoad; + return true; + } case Intrinsic::aarch64_neon_ld2lane: case Intrinsic::aarch64_neon_ld3lane: case Intrinsic::aarch64_neon_ld4lane: @@ -13848,9 +13858,13 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::aarch64_neon_ld3r: case Intrinsic::aarch64_neon_ld4r: { Info.opc = ISD::INTRINSIC_W_CHAIN; - // Conservatively set memVT to the entire set of vectors loaded. - uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; - Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + // ldx return struct with the same vec type + Type *RetTy = I.getType(); + auto *StructTy = cast<StructType>(RetTy); + unsigned NumElts = StructTy->getNumElements(); + Type *VecTy = StructTy->getElementType(0); + MVT EleVT = MVT::getVT(VecTy).getVectorElementType(); + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts); Info.ptrVal = I.getArgOperand(I.arg_size() - 1); Info.offset = 0; Info.align.reset(); @@ -13863,20 +13877,40 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::aarch64_neon_st4: case Intrinsic::aarch64_neon_st1x2: case Intrinsic::aarch64_neon_st1x3: - case Intrinsic::aarch64_neon_st1x4: + case Intrinsic::aarch64_neon_st1x4: { + Info.opc = ISD::INTRINSIC_VOID; + unsigned NumElts = 0; + for (const Value *Arg : I.args()) { + Type *ArgTy = Arg->getType(); + if (!ArgTy->isVectorTy()) + break; + NumElts += DL.getTypeSizeInBits(ArgTy) / 64; + } + Info.memVT = 
EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + Info.ptrVal = I.getArgOperand(I.arg_size() - 1); + Info.offset = 0; + Info.align.reset(); + // volatile stores with NEON intrinsics not supported + Info.flags = MachineMemOperand::MOStore; + return true; + } case Intrinsic::aarch64_neon_st2lane: case Intrinsic::aarch64_neon_st3lane: case Intrinsic::aarch64_neon_st4lane: { Info.opc = ISD::INTRINSIC_VOID; - // Conservatively set memVT to the entire set of vectors stored. unsigned NumElts = 0; + // all the vector type is same + Type *VecTy = I.getArgOperand(0)->getType(); + MVT EleVT = MVT::getVT(VecTy).getVectorElementType(); + for (const Value *Arg : I.args()) { Type *ArgTy = Arg->getType(); if (!ArgTy->isVectorTy()) break; - NumElts += DL.getTypeSizeInBits(ArgTy) / 64; + NumElts += 1; } - Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts); Info.ptrVal = I.getArgOperand(I.arg_size() - 1); Info.offset = 0; Info.align.reset(); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index bd03ffaafab10..30bd580ad86a7 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -5433,8 +5433,8 @@ static bool getFNEGPatterns(MachineInstr &Root, auto Match = [&](unsigned Opcode, MachineCombinerPattern Pattern) -> bool { MachineOperand &MO = Root.getOperand(1); MachineInstr *MI = MRI.getUniqueVRegDef(MO.getReg()); - if (MI != nullptr && MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) && - (MI->getOpcode() == Opcode) && + if (MI != nullptr && (MI->getOpcode() == Opcode) && + MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) && Root.getFlag(MachineInstr::MIFlag::FmContract) && Root.getFlag(MachineInstr::MIFlag::FmNsz) && MI->getFlag(MachineInstr::MIFlag::FmContract) && diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td 
b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 0f3d346176780..9e72d37880c58 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -2253,7 +2253,7 @@ def : Pat<(int_aarch64_irg_sp i64:$Rm), (IRGstack SP, i64:$Rm)>; // Large STG to be expanded into a loop. $sz is the size, $Rn is start address. // $Rn_wback is one past the end of the range. $Rm is the loop counter. -let isCodeGenOnly=1, mayStore=1 in { +let isCodeGenOnly=1, mayStore=1, Defs=[NZCV] in { def STGloop_wback : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn_wback), (ins i64imm:$sz, GPR64sp:$Rn), [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,@earlyclobber $Rm" >, diff --git a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp index 59f1e8319ae72..d10bba26023ff 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp @@ -572,6 +572,15 @@ bool tryToFoldBNEOnCmpXchgResult(MachineBasicBlock &MBB, if (!(BNEOp0 == DestReg && BNEOp1 == CmpValReg) && !(BNEOp0 == CmpValReg && BNEOp1 == DestReg)) return false; + + // Make sure the branch is the only user of the AND. 
+ if (MaskReg.isValid()) { + if (BNEOp0 == DestReg && !MBBI->getOperand(0).isKill()) + return false; + if (BNEOp1 == DestReg && !MBBI->getOperand(1).isKill()) + return false; + } + ToErase.push_back(&*MBBI); LoopHeadBNETarget = MBBI->getOperand(2).getMBB(); MBBI = skipDebugInstructionsForward(std::next(MBBI), E); diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index cb2a49db92332..f312cc8129ddf 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -772,7 +772,8 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, if (FirstSPAdjustAmount) StackSize = FirstSPAdjustAmount; - if (RVFI->isPushable(MF) && MBBI->getOpcode() == RISCV::CM_POP) { + if (RVFI->isPushable(MF) && MBBI != MBB.end() && + MBBI->getOpcode() == RISCV::CM_POP) { // Use available stack adjustment in pop instruction to deallocate stack // space. unsigned PushStack = RVFI->getRVPushRegs() * (STI.getXLen() / 8); diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index cafce628cf6a2..aa20409da4e2b 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -3414,6 +3414,7 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) { // Because N and True must have the same merge operand (or True's operand is // implicit_def), the "effective" body is the minimum of their VLs. + SDValue OrigVL = VL; VL = GetMinVL(TrueVL, VL); if (!VL) return false; @@ -3461,7 +3462,17 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) { "Expected instructions with mask have a tied dest."); #endif - uint64_t Policy = isImplicitDef(Merge) ? RISCVII::TAIL_AGNOSTIC : /*TUMU*/ 0; + // Use a tumu policy, relaxing it to tail agnostic provided that the merge + // operand is undefined. 
+ // + // However, if the VL became smaller than what the vmerge had originally, then + // elements past VL that were previously in the vmerge's body will have moved + // to the tail. In that case we always need to use tail undisturbed to + // preserve them. + bool MergeVLShrunk = VL != OrigVL; + uint64_t Policy = (isImplicitDef(Merge) && !MergeVLShrunk) + ? RISCVII::TAIL_AGNOSTIC + : /*TUMU*/ 0; SDValue PolicyOp = CurDAG->getTargetConstant(Policy, DL, Subtarget->getXLenVT()); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 4398f996c9306..b402db9c4c170 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1648,7 +1648,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_ROUND, VT, Custom); setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom); } - for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32 }) { + for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) { setOperationAction(ISD::FP_EXTEND, VT, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom); } @@ -1656,9 +1656,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32); setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32); } - - setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal); } // This block controls legalization of the mask vector sizes that are @@ -1975,8 +1972,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setF16Action(MVT::v32f16, Expand); setOperationAction(ISD::FP_ROUND, MVT::v16f16, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Custom); - setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal); + setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, 
MVT::v16f32, Custom); for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) { setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32); setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32); @@ -2197,9 +2194,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal); setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal); - setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal); + setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal); - setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Legal); + setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom); @@ -2249,9 +2246,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom); setOperationAction(ISD::FP_ROUND, MVT::v8f16, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal); - setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); + setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal); - setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal); + setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal); // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE @@ -2275,8 +2272,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (!Subtarget.useSoftFloat() && (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) { - addRegisterClass(MVT::v8bf16, &X86::VR128XRegClass); - addRegisterClass(MVT::v16bf16, &X86::VR256XRegClass); + addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? 
&X86::VR128XRegClass + : &X86::VR128RegClass); + addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass + : &X86::VR256RegClass); // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't // provide the method to promote BUILD_VECTOR and INSERT_VECTOR_ELT. // Set the operation action Custom to do the customization later. @@ -2291,6 +2290,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); } + setOperationAction(ISD::FP_ROUND, MVT::v8bf16, Custom); addLegalFPImmediate(APFloat::getZero(APFloat::BFloat())); } @@ -2302,6 +2302,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FMUL, MVT::v32bf16, Expand); setOperationAction(ISD::FDIV, MVT::v32bf16, Expand); setOperationAction(ISD::BUILD_VECTOR, MVT::v32bf16, Custom); + setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32bf16, Custom); } @@ -11363,7 +11364,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget); - if (VT.getVectorElementType() == MVT::bf16 && Subtarget.hasBF16()) + if (VT.getVectorElementType() == MVT::bf16 && + (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget); if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget)) @@ -14795,13 +14797,9 @@ static bool isShuffleFoldableLoad(SDValue V) { } template <typename T> -static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) { - return VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16(); -} - -template <typename T> -bool X86TargetLowering::isSoftFP16(T VT) const { - return ::isSoftFP16(VT, Subtarget); +static bool isSoftF16(T VT, const X86Subtarget &Subtarget) { + T EltVT = VT.getScalarType(); + return EltVT == MVT::bf16 
|| (EltVT == MVT::f16 && !Subtarget.hasFP16()); } /// Try to lower insertion of a single element into a zero vector. @@ -14817,7 +14815,7 @@ static SDValue lowerShuffleAsElementInsertion( unsigned NumElts = VT.getVectorNumElements(); unsigned EltBits = VT.getScalarSizeInBits(); - if (isSoftFP16(EltVT, Subtarget)) + if (isSoftF16(EltVT, Subtarget)) return SDValue(); int V2Index = @@ -20374,7 +20372,7 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); - if (isSoftFP16(VT)) { + if (isSoftF16(VT, Subtarget)) { MVT NVT = VT.changeVectorElementTypeToInteger(); return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond, DAG.getBitcast(NVT, LHS), @@ -21852,7 +21850,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); - if (isSoftFP16(VT)) + if (isSoftF16(VT, Subtarget)) return promoteXINT_TO_FP(Op, DAG); else if (isLegalConversion(SrcVT, true, Subtarget)) return Op; @@ -22357,7 +22355,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, if (DstVT == MVT::f128) return SDValue(); - if (isSoftFP16(DstVT)) + if (isSoftF16(DstVT, Subtarget)) return promoteXINT_TO_FP(Op, DAG); else if (isLegalConversion(SrcVT, false, Subtarget)) return Op; @@ -23314,7 +23312,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); SDValue Res; - if (isSoftFP16(SrcVT)) { + if (isSoftF16(SrcVT, Subtarget)) { MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32; if (IsStrict) return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other}, @@ -23743,7 +23741,7 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const { // This code is only for floats and doubles. Fall back to generic code for // anything else. 
- if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftFP16(SrcVT)) + if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftF16(SrcVT, Subtarget)) return SDValue(); EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT(); @@ -23888,6 +23886,10 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { !Subtarget.getTargetTriple().isOSDarwin())) return SDValue(); + if ((SVT == MVT::v8f16 && Subtarget.hasF16C()) || + (SVT == MVT::v16f16 && Subtarget.useAVX512Regs())) + return Op; + if (SVT == MVT::f16) { if (Subtarget.hasFP16()) return Op; @@ -23960,7 +23962,25 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { if (!SVT.isVector()) return Op; + if (SVT.getVectorElementType() == MVT::bf16) { + // FIXME: Do we need to support strict FP? + assert(!IsStrict && "Strict FP doesn't support BF16"); + if (VT.getVectorElementType() == MVT::f64) { + MVT TmpVT = VT.changeVectorElementType(MVT::f32); + return DAG.getNode(ISD::FP_EXTEND, DL, VT, + DAG.getNode(ISD::FP_EXTEND, DL, TmpVT, In)); + } + assert(VT.getVectorElementType() == MVT::f32 && "Unexpected fpext"); + MVT NVT = SVT.changeVectorElementType(MVT::i32); + In = DAG.getBitcast(SVT.changeTypeToInteger(), In); + In = DAG.getNode(ISD::ZERO_EXTEND, DL, NVT, In); + In = DAG.getNode(ISD::SHL, DL, NVT, In, DAG.getConstant(16, DL, NVT)); + return DAG.getBitcast(VT, In); + } + if (SVT.getVectorElementType() == MVT::f16) { + if (Subtarget.hasFP16() && isTypeLegal(SVT)) + return Op; assert(Subtarget.hasF16C() && "Unexpected features!"); if (SVT == MVT::v2f16) In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In, @@ -24033,6 +24053,12 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { return Res; } + if (VT.getScalarType() == MVT::bf16) { + if (SVT.getScalarType() == MVT::f32 && isTypeLegal(VT)) + return Op; + return SDValue(); + } + if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) { if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32) return 
SDValue(); @@ -25676,7 +25702,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, if (isFP) { MVT EltVT = Op0.getSimpleValueType().getVectorElementType(); assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64); - if (isSoftFP16(EltVT, Subtarget)) + if (isSoftF16(EltVT, Subtarget)) return SDValue(); bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; @@ -26241,7 +26267,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get(); - if (isSoftFP16(Op0.getValueType())) + if (isSoftF16(Op0.getValueType(), Subtarget)) return SDValue(); // Handle f128 first, since one possible outcome is a normal integer @@ -26434,7 +26460,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op1.getSimpleValueType(); SDValue CC; - if (isSoftFP16(VT)) { + if (isSoftF16(VT, Subtarget)) { MVT NVT = VT.changeTypeToInteger(); return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond, DAG.getBitcast(NVT, Op1), @@ -26506,7 +26532,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } if (Cond.getOpcode() == ISD::SETCC && - !isSoftFP16(Cond.getOperand(0).getSimpleValueType())) { + !isSoftF16(Cond.getOperand(0).getSimpleValueType(), Subtarget)) { if (SDValue NewCond = LowerSETCC(Cond, DAG)) { Cond = NewCond; // If the condition was updated, it's possible that the operands of the @@ -27196,7 +27222,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { // Bail out when we don't have native compare instructions. 
if (Cond.getOpcode() == ISD::SETCC && Cond.getOperand(0).getValueType() != MVT::f128 && - !isSoftFP16(Cond.getOperand(0).getValueType())) { + !isSoftF16(Cond.getOperand(0).getValueType(), Subtarget)) { SDValue LHS = Cond.getOperand(0); SDValue RHS = Cond.getOperand(1); ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); @@ -34983,7 +35009,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, EVT SrcVT = Src.getValueType(); SDValue Res; - if (isSoftFP16(SrcVT)) { + if (isSoftF16(SrcVT, Subtarget)) { EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32; if (IsStrict) { Res = @@ -47383,7 +47409,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // ignored in unsafe-math mode). // We also try to create v2f32 min/max nodes, which we later widen to v4f32. if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && - VT != MVT::f80 && VT != MVT::f128 && !isSoftFP16(VT, Subtarget) && + VT != MVT::f80 && VT != MVT::f128 && !isSoftF16(VT, Subtarget) && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) && (Subtarget.hasSSE2() || (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) { @@ -47700,7 +47726,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, } // Early exit check - if (!TLI.isTypeLegal(VT) || isSoftFP16(VT, Subtarget)) + if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget)) return SDValue(); if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget)) @@ -54550,7 +54576,7 @@ static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) { static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); - if (Subtarget.useSoftFloat() || isSoftFP16(VT, Subtarget)) + if (Subtarget.useSoftFloat() || isSoftF16(VT, Subtarget)) return SDValue(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 250df82a30c2f..047d8f0210470 100644 --- 
a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1749,8 +1749,6 @@ namespace llvm { bool needsCmpXchgNb(Type *MemType) const; - template <typename T> bool isSoftFP16(T VT) const; - void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB, MachineBasicBlock *DispatchBB, int FI) const; diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index ecb5c3e912401..b5dac7a0c65af 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -12976,6 +12976,11 @@ let Predicates = [HasBF16, HasVLX] in { def : Pat<(v16bf16 (X86VBroadcast (v8bf16 VR128X:$src))), (VPBROADCASTWZ256rr VR128X:$src)>; + def : Pat<(v8bf16 (X86vfpround (v8f32 VR256X:$src))), + (VCVTNEPS2BF16Z256rr VR256X:$src)>; + def : Pat<(v8bf16 (X86vfpround (loadv8f32 addr:$src))), + (VCVTNEPS2BF16Z256rm addr:$src)>; + // TODO: No scalar broadcast due to we don't support legal scalar bf16 so far. } @@ -12985,6 +12990,11 @@ let Predicates = [HasBF16] in { def : Pat<(v32bf16 (X86VBroadcast (v8bf16 VR128X:$src))), (VPBROADCASTWZrr VR128X:$src)>; + + def : Pat<(v16bf16 (X86vfpround (v16f32 VR512:$src))), + (VCVTNEPS2BF16Zrr VR512:$src)>; + def : Pat<(v16bf16 (X86vfpround (loadv16f32 addr:$src))), + (VCVTNEPS2BF16Zrm addr:$src)>; // TODO: No scalar broadcast due to we don't support legal scalar bf16 so far. 
} diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 6c57eceab3769..a6fcc804e1d06 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -8289,6 +8289,11 @@ let Predicates = [HasAVXNECONVERT] in { f256mem>, T8PS; let checkVEXPredicate = 1 in defm VCVTNEPS2BF16 : VCVTNEPS2BF16_BASE, VEX, T8XS, ExplicitVEXPrefix; + + def : Pat<(v8bf16 (X86vfpround (v8f32 VR256:$src))), + (VCVTNEPS2BF16Yrr VR256:$src)>; + def : Pat<(v8bf16 (X86vfpround (loadv8f32 addr:$src))), + (VCVTNEPS2BF16Yrm addr:$src)>; } def : InstAlias<"vcvtneps2bf16x\t{$src, $dst|$dst, $src}", diff --git a/llvm/lib/Transforms/Coroutines/CoroElide.cpp b/llvm/lib/Transforms/Coroutines/CoroElide.cpp index d78ab1c1ea284..d0606c15f3d5b 100644 --- a/llvm/lib/Transforms/Coroutines/CoroElide.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroElide.cpp @@ -194,12 +194,49 @@ bool Lowerer::hasEscapePath(const CoroBeginInst *CB, for (auto *DA : It->second) Visited.insert(DA->getParent()); + SmallPtrSet EscapingBBs; + for (auto *U : CB->users()) { + // The use from coroutine intrinsics are not a problem. + if (isa(U)) + continue; + + // Think all other usages may be an escaping candidate conservatively. + // + // Note that the major user of switch ABI coroutine (the C++) will store + // resume.fn, destroy.fn and the index to the coroutine frame immediately. + // So the parent of the coro.begin in C++ will be always escaping. + // Then we can't get any performance benefits for C++ by improving the + // precision of the method. + // + // The reason why we still judge it is we want to make LLVM Coroutine in + // switch ABIs to be self contained as much as possible instead of a + // by-product of C++20 Coroutines. 
+ EscapingBBs.insert(cast(U)->getParent()); + } + + bool PotentiallyEscaped = false; + do { const auto *BB = Worklist.pop_back_val(); if (!Visited.insert(BB).second) continue; - if (TIs.count(BB)) - return true; + + // A Path insensitive marker to test whether the coro.begin escapes. + // It is intentional to make it path insensitive while it may not be + // precise since we don't want the process to be too slow. + PotentiallyEscaped |= EscapingBBs.count(BB); + + if (TIs.count(BB)) { + if (!BB->getTerminator()->isExceptionalTerminator() || PotentiallyEscaped) + return true; + + // If the function ends with the exceptional terminator, the memory used + // by the coroutine frame can be released by stack unwinding + // automatically. So we can think the coro.begin doesn't escape if it + // exits the function by exceptional terminator. + + continue; + } // Conservatively say that there is potentially a path. if (!--Limit) @@ -236,36 +273,36 @@ bool Lowerer::shouldElide(Function *F, DominatorTree &DT) const { // memory location storing that value and not the virtual register. SmallPtrSet Terminators; - // First gather all of the non-exceptional terminators for the function. + // First gather all of the terminators for the function. // Consider the final coro.suspend as the real terminator when the current // function is a coroutine. - for (BasicBlock &B : *F) { - auto *TI = B.getTerminator(); - if (TI->getNumSuccessors() == 0 && !TI->isExceptionalTerminator() && - !isa(TI)) - Terminators.insert(&B); - } + for (BasicBlock &B : *F) { + auto *TI = B.getTerminator(); + + if (TI->getNumSuccessors() != 0 || isa(TI)) + continue; + + Terminators.insert(&B); + } // Filter out the coro.destroy that lie along exceptional paths. SmallPtrSet ReferencedCoroBegins; for (const auto &It : DestroyAddr) { - // If there is any coro.destroy dominates all of the terminators for the - // coro.begin, we could know the corresponding coro.begin wouldn't escape. 
- for (Instruction *DA : It.second) { - if (llvm::all_of(Terminators, [&](auto *TI) { - return DT.dominates(DA, TI->getTerminator()); - })) { - ReferencedCoroBegins.insert(It.first); - break; - } - } - - // Whether there is any paths from coro.begin to Terminators which not pass - // through any of the coro.destroys. + // If every terminators is dominated by coro.destroy, we could know the + // corresponding coro.begin wouldn't escape. + // + // Otherwise hasEscapePath would decide whether there is any paths from + // coro.begin to Terminators which not pass through any of the + // coro.destroys. // // hasEscapePath is relatively slow, so we avoid to run it as much as // possible. - if (!ReferencedCoroBegins.count(It.first) && + if (llvm::all_of(Terminators, + [&](auto *TI) { + return llvm::any_of(It.second, [&](auto *DA) { + return DT.dominates(DA, TI->getTerminator()); + }); + }) || !hasEscapePath(It.first, Terminators)) ReferencedCoroBegins.insert(It.first); } diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index 15628d32280d8..2b88dd08d88b6 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -424,7 +424,7 @@ static Decomposition decompose(Value *V, return MergeResults(Op0, Op1, IsSigned); ConstantInt *CI; - if (match(V, m_NSWMul(m_Value(Op0), m_ConstantInt(CI)))) { + if (match(V, m_NSWMul(m_Value(Op0), m_ConstantInt(CI))) && canUseSExt(CI)) { auto Result = decompose(Op0, Preconditions, IsSigned, DL); Result.mul(CI->getSExtValue()); return Result; diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 24390f1b54f60..5b8f1b00dc034 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -1269,6 +1269,7 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) { if (IsLoadCSE) { LoadInst 
*NLoadI = cast<LoadInst>(AvailableVal); combineMetadataForCSE(NLoadI, LoadI, false); + LVI->forgetValue(NLoadI); }; // If the returned value is the load itself, replace with poison. This can @@ -1461,6 +1462,7 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) { for (LoadInst *PredLoadI : CSELoads) { combineMetadataForCSE(PredLoadI, LoadI, true); + LVI->forgetValue(PredLoadI); } LoadI->replaceAllUsesWith(PN); diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-st-lane-aa.ll b/llvm/test/CodeGen/AArch64/arm64-neon-st-lane-aa.ll new file mode 100644 index 0000000000000..7642597c91f2b --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64-neon-st-lane-aa.ll @@ -0,0 +1,34 @@ +; RUN: llc < %s -mtriple=arm64-none-linux-gnu -mattr=+neon -O2 | FileCheck %s + +; st2 must before two ldrb. +; The situation that put one ldrb before st2 because of the conservative memVT set for st2lane, +; which lead to basic-aa goes wrong. + +define dso_local i32 @test_vst2_lane_u8([2 x <8 x i8>] %vectors.coerce) local_unnamed_addr { +; CHECK-LABEL: test_vst2_lane_u8: +; CHECK: st2 { v[[V1:[0-9]+]].b, v[[V2:[0-9]+]].b }[6], [x8] +; CHECK-NEXT: umov w[[W1:[0-9]+]], v[[V12:[0-9]+]].b[6] +; CHECK-NEXT: ldrb w[[W2:[0-9]+]], [sp, #12] +; CHECK-NEXT: ldrb w[[W2:[0-9]+]], [sp, #13] +entry: + %temp = alloca [2 x i8], align 4 + %vectors.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %vectors.coerce, 0 + %vectors.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %vectors.coerce, 1 + call void @llvm.lifetime.start.p0(i64 2, ptr nonnull %temp) #4 + call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> %vectors.coerce.fca.0.extract, <8 x i8> %vectors.coerce.fca.1.extract, i64 6, ptr nonnull %temp) + %0 = load i8, ptr %temp, align 4 + %vget_lane = extractelement <8 x i8> %vectors.coerce.fca.0.extract, i64 6 + %cmp8.not = icmp ne i8 %0, %vget_lane + %arrayidx3.1 = getelementptr inbounds [2 x i8], ptr %temp, i64 0, i64 1 + %1 = load i8, ptr %arrayidx3.1, align 1 + %vget_lane.1 = extractelement <8 
x i8> %vectors.coerce.fca.1.extract, i64 6 + %cmp8.not.1 = icmp ne i8 %1, %vget_lane.1 + %or.cond = select i1 %cmp8.not, i1 true, i1 %cmp8.not.1 + %cmp.lcssa = zext i1 %or.cond to i32 + call void @llvm.lifetime.end.p0(i64 2, ptr nonnull %temp) #4 + ret i32 %cmp.lcssa +} + +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #2 +declare void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8>, <8 x i8>, i64, ptr nocapture) #2 +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #2 diff --git a/llvm/test/CodeGen/AArch64/emit_fneg_with_non_register_operand.mir b/llvm/test/CodeGen/AArch64/emit_fneg_with_non_register_operand.mir new file mode 100644 index 0000000000000..6fe094cc6cbb4 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/emit_fneg_with_non_register_operand.mir @@ -0,0 +1,130 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -mtriple aarch64 -run-pass=machine-combiner -o - %s | FileCheck %s +--- | + target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + target triple = "aarch64" + + @c = global double 0.000000e+00, align 8 + + define void @emit_fneg_with_non_register_operand(double %c) { + entry: + %0 = load double, ptr @c, align 8 + %1 = tail call double asm sideeffect "", "=w,0"(double %0) + %2 = load double, ptr @c, align 8 + %3 = tail call double asm sideeffect "", "=w,0"(double %2) + %fneg = fneg double %1 + %cmp = fcmp oeq double %3, %fneg + br i1 %cmp, label %if.then, label %if.end + + if.then: ; preds = %entry + tail call void @b(double noundef %1) + ret void + + if.end: ; preds = %entry + ret void + } + + declare void @b(double noundef) + +... 
+--- +name: emit_fneg_with_non_register_operand +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +callsEHReturn: false +callsUnwindInit: false +hasEHCatchret: false +hasEHScopes: false +hasEHFunclets: false +isOutlined: false +debugInstrRef: false +failsVerification: false +tracksDebugUserValues: false +registers: + - { id: 0, class: fpr64, preferred-register: '' } + - { id: 1, class: fpr64, preferred-register: '' } + - { id: 2, class: fpr64, preferred-register: '' } + - { id: 3, class: fpr64, preferred-register: '' } + - { id: 4, class: fpr64, preferred-register: '' } + - { id: 5, class: fpr64, preferred-register: '' } + - { id: 6, class: gpr64common, preferred-register: '' } + - { id: 7, class: fpr64, preferred-register: '' } +liveins: [] +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + functionContext: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: true + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +entry_values: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: emit_fneg_with_non_register_operand + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x50000000), %bb.2(0x30000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[LOADgot:%[0-9]+]]:gpr64common = LOADgot target-flags(aarch64-got) @c + ; CHECK-NEXT: [[LDRDui:%[0-9]+]]:fpr64 = LDRDui [[LOADgot]], 0 :: (dereferenceable load (s64) from @c) + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2359306 /* regdef:FPR64 */, def %2, 2147483657 /* reguse tiedto:$0 */, [[LDRDui]](tied-def 3) + ; 
CHECK-NEXT: [[COPY:%[0-9]+]]:fpr64 = COPY %2 + ; CHECK-NEXT: [[LDRDui1:%[0-9]+]]:fpr64 = LDRDui [[LOADgot]], 0 :: (dereferenceable load (s64) from @c) + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2359306 /* regdef:FPR64 */, def %4, 2147483657 /* reguse tiedto:$0 */, [[LDRDui1]](tied-def 3) + ; CHECK-NEXT: [[FNEGDr:%[0-9]+]]:fpr64 = FNEGDr %2 + ; CHECK-NEXT: nofpexcept FCMPDrr %4, killed [[FNEGDr]], implicit-def $nzcv, implicit $fpcr + ; CHECK-NEXT: Bcc 1, %bb.2, implicit $nzcv + ; CHECK-NEXT: B %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.if.then: + ; CHECK-NEXT: $d0 = COPY [[COPY]] + ; CHECK-NEXT: TCRETURNdi @b, 0, csr_aarch64_aapcs, implicit $sp, implicit $d0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.if.end: + ; CHECK-NEXT: RET_ReallyLR + bb.0.entry: + successors: %bb.1(0x50000000), %bb.2(0x30000000) + + %6:gpr64common = LOADgot target-flags(aarch64-got) @c + %3:fpr64 = LDRDui %6, 0 :: (dereferenceable load (s64) from @c) + INLINEASM &"", 1 /* sideeffect attdialect */, 2359306 /* regdef:FPR64 */, def %2, 2147483657 /* reguse tiedto:$0 */, %3(tied-def 3) + %0:fpr64 = COPY %2 + %5:fpr64 = LDRDui %6, 0 :: (dereferenceable load (s64) from @c) + INLINEASM &"", 1 /* sideeffect attdialect */, 2359306 /* regdef:FPR64 */, def %4, 2147483657 /* reguse tiedto:$0 */, %5(tied-def 3) + %7:fpr64 = FNEGDr %2 + nofpexcept FCMPDrr %4, killed %7, implicit-def $nzcv, implicit $fpcr + Bcc 1, %bb.2, implicit $nzcv + B %bb.1 + + bb.1.if.then: + $d0 = COPY %0 + TCRETURNdi @b, 0, csr_aarch64_aapcs, implicit $sp, implicit $d0 + + bb.2.if.end: + RET_ReallyLR + +... 
diff --git a/llvm/test/CodeGen/AArch64/memtag-loop-nzcv.ll b/llvm/test/CodeGen/AArch64/memtag-loop-nzcv.ll new file mode 100644 index 0000000000000..86bafd1c93bc1 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/memtag-loop-nzcv.ll @@ -0,0 +1,59 @@ +; RUN: llc -O2 -print-after-isel -mtriple=aarch64-linux-gnu %s -o /dev/null 2>&1 | FileCheck %s --check-prefixes=CHECK + +; This test function includes a 256-byte buffer. We expect it to require its +; MTE tags to be set to a useful value on entry, and cleared again on exit. At +; the time of writing this test, the pseudo-instructions chosen are +; STGloop_wback and STGloop respectively, but if different pseudos are selected +; in future, that's not a problem. The important thing is that both should +; include that implicit-def of $nzcv, because these pseudo-instructions will +; expand into loops that use the flags for their termination tests. + +; CHECK: STGloop_wback 256, {{.*}}, implicit-def dead $nzcv +; CHECK: STGloop 256, {{.*}}, implicit-def dead $nzcv + +define i32 @foo(i32 noundef %0) #0 { + %2 = alloca i32, align 4 + %3 = alloca [256 x i8], align 1 + %4 = alloca i64, align 8 + %5 = alloca i32, align 4 + %6 = alloca i64, align 8 + store i32 %0, ptr %2, align 4 + %7 = load i32, ptr %2, align 4 + %8 = getelementptr inbounds [256 x i8], ptr %3, i64 0, i64 0 + %9 = call i64 @read(i32 noundef %7, ptr noundef %8, i64 noundef 256) + store i64 %9, ptr %4, align 8 + store i32 0, ptr %5, align 4 + store i64 0, ptr %6, align 8 + br label %10 + +10: ; preds = %21, %1 + %11 = load i64, ptr %6, align 8 + %12 = load i64, ptr %4, align 8 + %13 = icmp ult i64 %11, %12 + br i1 %13, label %14, label %24 + +14: ; preds = %10 + %15 = load i64, ptr %6, align 8 + %16 = getelementptr inbounds [256 x i8], ptr %3, i64 0, i64 %15 + %17 = load i8, ptr %16, align 1 + %18 = zext i8 %17 to i32 + %19 = load i32, ptr %5, align 4 + %20 = add nsw i32 %19, %18 + store i32 %20, ptr %5, align 4 + br label %21 + +21: ; preds = %14 + %22 = load i64, ptr 
%6, align 8 + %23 = add i64 %22, 1 + store i64 %23, ptr %6, align 8 + br label %10 + +24: ; preds = %10 + %25 = load i32, ptr %5, align 4 + %26 = srem i32 %25, 251 + ret i32 %26 +} + +declare i64 @read(i32 noundef, ptr noundef, i64 noundef) + +attributes #0 = { sanitize_memtag "target-features"="+mte" } diff --git a/llvm/test/CodeGen/AArch64/multi-vector-load-size.ll b/llvm/test/CodeGen/AArch64/multi-vector-load-size.ll new file mode 100644 index 0000000000000..ecb953366a88e --- /dev/null +++ b/llvm/test/CodeGen/AArch64/multi-vector-load-size.ll @@ -0,0 +1,106 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -stop-after=instruction-select < %s | FileCheck %s + +%struct.__neon_float32x2x2_t = type { <2 x float>, <2 x float> } +%struct.__neon_float32x2x3_t = type { <2 x float>, <2 x float>, <2 x float> } +%struct.__neon_float32x2x4_t = type { <2 x float>, <2 x float>, <2 x float>, <2 x float> } + +declare %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2.v2f32.p0f32(float*) +declare %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3.v2f32.p0f32(float*) +declare %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4.v2f32.p0f32(float*) + +declare %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float*) +declare %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float*) +declare %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float*) + +declare %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2r.v2f32.p0f32(float*) +declare %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3r.v2f32.p0f32(float*) +declare %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4r.v2f32.p0f32(float*) + +declare %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2lane.v2f32.p0f32(<2 x float>, <2 x float>, i64, float*) +declare %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, i64, float*) +declare %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x 
float>, <2 x float>, i64, float*) + + +define %struct.__neon_float32x2x2_t @test_ld2(float* %addr) { + ; CHECK-LABEL: name: test_ld2 + ; CHECK: LD2Twov2s {{.*}} :: (load (s128) {{.*}}) + %val = call %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2.v2f32.p0f32(float* %addr) + ret %struct.__neon_float32x2x2_t %val +} + +define %struct.__neon_float32x2x3_t @test_ld3(float* %addr) { + ; CHECK-LABEL: name: test_ld3 + ; CHECK: LD3Threev2s {{.*}} :: (load (s192) {{.*}}) + %val = call %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3.v2f32.p0f32(float* %addr) + ret %struct.__neon_float32x2x3_t %val +} + +define %struct.__neon_float32x2x4_t @test_ld4(float* %addr) { + ; CHECK-LABEL: name: test_ld4 + ; CHECK: LD4Fourv2s {{.*}} :: (load (s256) {{.*}}) + %val = call %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4.v2f32.p0f32(float* %addr) + ret %struct.__neon_float32x2x4_t %val +} + +define %struct.__neon_float32x2x2_t @test_ld1x2(float* %addr) { + ; CHECK-LABEL: name: test_ld1x2 + ; CHECK: LD1Twov2s {{.*}} :: (load (s128) {{.*}}) + %val = call %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float* %addr) + ret %struct.__neon_float32x2x2_t %val +} + +define %struct.__neon_float32x2x3_t @test_ld1x3(float* %addr) { + ; CHECK-LABEL: name: test_ld1x3 + ; CHECK: LD1Threev2s {{.*}} :: (load (s192) {{.*}}) + %val = call %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float* %addr) + ret %struct.__neon_float32x2x3_t %val +} + +define %struct.__neon_float32x2x4_t @test_ld1x4(float* %addr) { + ; CHECK-LABEL: name: test_ld1x4 + ; CHECK: LD1Fourv2s {{.*}} :: (load (s256) {{.*}}) + %val = call %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float* %addr) + ret %struct.__neon_float32x2x4_t %val +} + +define %struct.__neon_float32x2x2_t @test_ld2r(float* %addr) { + ; CHECK-LABEL: name: test_ld2r + ; CHECK: LD2Rv2s {{.*}} :: (load (s64) {{.*}}) + %val = call %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2r.v2f32.p0f32(float* 
%addr) + ret %struct.__neon_float32x2x2_t %val +} + +define %struct.__neon_float32x2x3_t @test_ld3r(float* %addr) { + ; CHECK-LABEL: name: test_ld3r + ; CHECK: LD3Rv2s {{.*}} :: (load (s96) {{.*}}) + %val = call %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3r.v2f32.p0f32(float* %addr) + ret %struct.__neon_float32x2x3_t %val +} + +define %struct.__neon_float32x2x4_t @test_ld4r(float* %addr) { + ; CHECK-LABEL: name: test_ld4r + ; CHECK: LD4Rv2s {{.*}} :: (load (s128) {{.*}}) + %val = call %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4r.v2f32.p0f32(float* %addr) + ret %struct.__neon_float32x2x4_t %val +} + +define %struct.__neon_float32x2x2_t @test_ld2lane(<2 x float> %a, <2 x float> %b, float* %addr) { + ; CHECK-LABEL: name: test_ld2lane + ; CHECK: {{.*}} LD2i32 {{.*}} + %val = call %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2lane.v2f32.p0f32(<2 x float> %a, <2 x float> %b, i64 1, float* %addr) + ret %struct.__neon_float32x2x2_t %val +} + +define %struct.__neon_float32x2x3_t @test_ld3lane(<2 x float> %a, <2 x float> %b, <2 x float> %c, float* %addr) { + ; CHECK-LABEL: name: test_ld3lane + ; CHECK: {{.*}} LD3i32 {{.*}} + %val = call %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3lane.v2f32.p0f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, i64 1, float* %addr) + ret %struct.__neon_float32x2x3_t %val +} + +define %struct.__neon_float32x2x4_t @test_ld4lane(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d, float* %addr) { + ; CHECK-LABEL: name: test_ld4lane + ; CHECK: {{.*}} LD4i32 {{.*}} + %val = call %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4lane.v2f32.p0f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d, i64 1, float* %addr) + ret %struct.__neon_float32x2x4_t %val +} \ No newline at end of file diff --git a/llvm/test/CodeGen/AArch64/multi-vector-store-size.ll b/llvm/test/CodeGen/AArch64/multi-vector-store-size.ll index 5763ec61667f2..3710db9c47ff6 100644 --- 
a/llvm/test/CodeGen/AArch64/multi-vector-store-size.ll +++ b/llvm/test/CodeGen/AArch64/multi-vector-store-size.ll @@ -23,8 +23,6 @@ define void @addstx(ptr %res, ptr %a, ptr %b, ptr %c, ptr %d) { %cr = fadd <4 x float> %cl, %dl %dr = fadd <4 x float> %dl, %al -; The sizes below are conservative. AArch64TargetLowering -; conservatively assumes the entire vector is stored. tail call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float> %ar, <4 x float> %br, ptr %res) ; CHECK: ST2Twov4s {{.*}} :: (store (s256) {{.*}}) tail call void @llvm.aarch64.neon.st3.v4f32.p0(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, ptr %res) @@ -46,8 +44,6 @@ define void @addst1x(ptr %res, ptr %a, ptr %b, ptr %c, ptr %d) { %cr = fadd <4 x float> %cl, %dl %dr = fadd <4 x float> %dl, %al -; The sizes below are conservative. AArch64TargetLowering -; conservatively assumes the entire vector is stored. tail call void @llvm.aarch64.neon.st1x2.v4f32.p0(<4 x float> %ar, <4 x float> %br, ptr %res) ; CHECK: ST1Twov4s {{.*}} :: (store (s256) {{.*}}) tail call void @llvm.aarch64.neon.st1x3.v4f32.p0(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, ptr %res) @@ -69,14 +65,12 @@ define void @addstxlane(ptr %res, ptr %a, ptr %b, ptr %c, ptr %d) { %cr = fadd <4 x float> %cl, %dl %dr = fadd <4 x float> %dl, %al -; The sizes below are conservative. AArch64TargetLowering -; conservatively assumes the entire vector is stored. 
tail call void @llvm.aarch64.neon.st2lane.v4f32.p0(<4 x float> %ar, <4 x float> %br, i64 1, ptr %res) -; CHECK: ST2i32 {{.*}} :: (store (s256) {{.*}}) +; CHECK: ST2i32 {{.*}} :: (store (s64) {{.*}}) tail call void @llvm.aarch64.neon.st3lane.v4f32.p0(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, i64 1, ptr %res) -; CHECK: ST3i32 {{.*}} :: (store (s384) {{.*}}) +; CHECK: ST3i32 {{.*}} :: (store (s96) {{.*}}) tail call void @llvm.aarch64.neon.st4lane.v4f32.p0(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, <4 x float> %dr, i64 1, ptr %res) -; CHECK: ST4i32 {{.*}} :: (store (s512) {{.*}}) +; CHECK: ST4i32 {{.*}} :: (store (s128) {{.*}}) ret void } diff --git a/llvm/test/CodeGen/RISCV/pr65025.ll b/llvm/test/CodeGen/RISCV/pr65025.ll new file mode 100644 index 0000000000000..dcd71edc460b8 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/pr65025.ll @@ -0,0 +1,48 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s -mtriple=riscv64 -mattr=+a | FileCheck %s + +define ptr @cmpxchg_masked_and_branch1(ptr %ptr, i8 signext %cmp, i8 signext %val) nounwind { +; CHECK-LABEL: cmpxchg_masked_and_branch1: +; CHECK: # %bb.0: # %do_cmpxchg +; CHECK-NEXT: andi a3, a0, -4 +; CHECK-NEXT: slli a4, a0, 3 +; CHECK-NEXT: li a5, 255 +; CHECK-NEXT: sllw a5, a5, a4 +; CHECK-NEXT: andi a1, a1, 255 +; CHECK-NEXT: sllw a1, a1, a4 +; CHECK-NEXT: andi a2, a2, 255 +; CHECK-NEXT: sllw a2, a2, a4 +; CHECK-NEXT: .LBB0_3: # %do_cmpxchg +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: lr.w.aqrl a4, (a3) +; CHECK-NEXT: and a6, a4, a5 +; CHECK-NEXT: bne a6, a1, .LBB0_5 +; CHECK-NEXT: # %bb.4: # %do_cmpxchg +; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1 +; CHECK-NEXT: xor a6, a4, a2 +; CHECK-NEXT: and a6, a6, a5 +; CHECK-NEXT: xor a6, a4, a6 +; CHECK-NEXT: sc.w.rl a6, a6, (a3) +; CHECK-NEXT: bnez a6, .LBB0_3 +; CHECK-NEXT: .LBB0_5: # %do_cmpxchg +; CHECK-NEXT: and a2, a4, a5 +; CHECK-NEXT: bne a1, a2, .LBB0_2 +; CHECK-NEXT: 
# %bb.1: # %returnptr +; CHECK-NEXT: xor a1, a1, a2 +; CHECK-NEXT: snez a1, a1 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: and a0, a1, a0 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_2: # %exit +; CHECK-NEXT: li a0, 0 +; CHECK-NEXT: ret +do_cmpxchg: + %0 = cmpxchg ptr %ptr, i8 %cmp, i8 %val seq_cst seq_cst + %1 = extractvalue { i8, i1 } %0, 1 + %2 = select i1 %1, ptr %ptr, ptr null + br i1 %1, label %returnptr, label %exit +returnptr: + ret ptr %2 +exit: + ret ptr null +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll index e74daee7cdddd..35d9b27c75f7d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll @@ -28,7 +28,7 @@ define void @vselect_vv_v6i32(ptr %a, ptr %b, ptr %cc, ptr %z) { ; RV32-NEXT: vslidedown.vi v10, v10, 2 ; RV32-NEXT: vand.vi v10, v10, 1 ; RV32-NEXT: vmsne.vi v0, v10, 0 -; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, mu +; RV32-NEXT: vsetivli zero, 6, e32, m2, tu, mu ; RV32-NEXT: vle32.v v8, (a0), v0.t ; RV32-NEXT: vse32.v v8, (a3) ; RV32-NEXT: ret @@ -58,7 +58,7 @@ define void @vselect_vv_v6i32(ptr %a, ptr %b, ptr %cc, ptr %z) { ; RV64-NEXT: vslidedown.vi v10, v10, 2 ; RV64-NEXT: vand.vi v10, v10, 1 ; RV64-NEXT: vmsne.vi v0, v10, 0 -; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, mu +; RV64-NEXT: vsetivli zero, 6, e32, m2, tu, mu ; RV64-NEXT: vle32.v v8, (a0), v0.t ; RV64-NEXT: vse32.v v8, (a3) ; RV64-NEXT: ret @@ -239,7 +239,7 @@ define void @vselect_vv_v6f32(ptr %a, ptr %b, ptr %cc, ptr %z) { ; RV32-NEXT: vslidedown.vi v10, v10, 2 ; RV32-NEXT: vand.vi v10, v10, 1 ; RV32-NEXT: vmsne.vi v0, v10, 0 -; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, mu +; RV32-NEXT: vsetivli zero, 6, e32, m2, tu, mu ; RV32-NEXT: vle32.v v8, (a0), v0.t ; RV32-NEXT: vse32.v v8, (a3) ; RV32-NEXT: ret @@ -269,7 +269,7 @@ define void @vselect_vv_v6f32(ptr %a, ptr %b, ptr %cc, ptr %z) { ; RV64-NEXT: vslidedown.vi v10, v10, 2 ; RV64-NEXT: 
vand.vi v10, v10, 1 ; RV64-NEXT: vmsne.vi v0, v10, 0 -; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, mu +; RV64-NEXT: vsetivli zero, 6, e32, m2, tu, mu ; RV64-NEXT: vle32.v v8, (a0), v0.t ; RV64-NEXT: vse32.v v8, (a3) ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll index 7620ba5310720..3c6515595b642 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll +++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll @@ -1065,3 +1065,15 @@ define @vmerge_larger_vl_poison_passthru( % ret %b } +; The vadd's new policy should be tail undisturbed since the false op of the +; vmerge moves from the body to the tail, and we need to preserve it. +define @vmerge_larger_vl_false_becomes_tail( %false, %x, %y, %m) { +; CHECK-LABEL: vmerge_larger_vl_false_becomes_tail: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, mu +; CHECK-NEXT: vadd.vv v8, v9, v10, v0.t +; CHECK-NEXT: ret + %a = call @llvm.riscv.vadd.nxv2i32.nxv2i32( poison, %x, %y, i64 2) + %b = call @llvm.riscv.vmerge.nxv2i32.nxv2i32( poison, %false, %a, %m, i64 3) + ret %b +} diff --git a/llvm/test/CodeGen/RISCV/zcmp-prolog-epilog-crash.mir b/llvm/test/CodeGen/RISCV/zcmp-prolog-epilog-crash.mir new file mode 100644 index 0000000000000..64556ec0b343a --- /dev/null +++ b/llvm/test/CodeGen/RISCV/zcmp-prolog-epilog-crash.mir @@ -0,0 +1,158 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +# REQUIRES: asserts +# RUN: llc %s -o - -mtriple=riscv32 -mattr=+zcmp -target-abi ilp32 -run-pass=prologepilog \ +# RUN: -simplify-mir -verify-machineinstrs | FileCheck %s + +--- | + define hidden void @f(fp128 %a) local_unnamed_addr #0 { + entry: + %0 = bitcast fp128 %a to i128 + %and.i = lshr i128 %0, 112 + %1 = trunc i128 %and.i to i32 + %2 = and i32 %1, 32767 + %or.i = or i128 poison, 5192296858534827628530496329220096 + br label %if.end.i + + if.end.i: ; 
preds = %entry + br i1 poison, label %exit, label %if.then12.i + + if.then12.i: ; preds = %if.end.i + %sub13.i = sub nuw nsw i32 16495, %2 + %sh_prom.i = zext i32 %sub13.i to i128 + %shr14.i = lshr i128 %or.i, %sh_prom.i + %conv15.i = trunc i128 %shr14.i to i32 + br label %exit + + exit: ; preds = %if.then12.i, %if.end.i + %retval.0.i = phi i32 [ %conv15.i, %if.then12.i ], [ -1, %if.end.i ] + ret void + } +... +--- +name: f +alignment: 2 +tracksRegLiveness: true +tracksDebugUserValues: true +liveins: + - { reg: '$x10' } +frameInfo: + maxAlignment: 1 + localFrameSize: 32 + savePoint: '%bb.2' + restorePoint: '%bb.2' +stack: + - { id: 0, size: 32, alignment: 1, local-offset: -32 } +machineFunctionInfo: + varArgsFrameIndex: 0 + varArgsSaveSize: 0 +body: | + ; CHECK-LABEL: name: f + ; CHECK: bb.0.entry: + ; CHECK-NEXT: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $x10 = ADDI $x0, -1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.if.end.i: + ; CHECK-NEXT: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: BNE $x0, $x0, %bb.3 + ; CHECK-NEXT: PseudoBR %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.if.then12.i: + ; CHECK-NEXT: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x2 = frame-setup ADDI $x2, -32 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 32 + ; CHECK-NEXT: SB $x0, $x2, 31 :: (store (s8) into %stack.0 + 31) + ; CHECK-NEXT: SB $x0, $x2, 30 :: (store (s8) into %stack.0 + 30) + ; CHECK-NEXT: SB $x0, $x2, 29 :: (store (s8) into %stack.0 + 29) + ; CHECK-NEXT: SB $x0, $x2, 28 :: (store (s8) into %stack.0 + 28) + ; CHECK-NEXT: SB $x0, $x2, 27 :: (store (s8) into %stack.0 + 27) + ; CHECK-NEXT: SB $x0, $x2, 26 :: (store (s8) into %stack.0 + 26) + ; CHECK-NEXT: SB $x0, $x2, 25 :: (store (s8) into %stack.0 + 25) + ; CHECK-NEXT: SB $x0, $x2, 24 :: (store (s8) into %stack.0 + 24) + ; CHECK-NEXT: SB $x0, $x2, 23 :: (store (s8) into %stack.0 + 23) + ; CHECK-NEXT: SB $x0, $x2, 22 :: (store (s8) into %stack.0 + 22) + ; CHECK-NEXT: SB $x0, 
$x2, 21 :: (store (s8) into %stack.0 + 21) + ; CHECK-NEXT: SB $x0, $x2, 20 :: (store (s8) into %stack.0 + 20) + ; CHECK-NEXT: SB $x0, $x2, 19 :: (store (s8) into %stack.0 + 19) + ; CHECK-NEXT: SB $x0, $x2, 18 :: (store (s8) into %stack.0 + 18) + ; CHECK-NEXT: SB $x0, $x2, 17 :: (store (s8) into %stack.0 + 17) + ; CHECK-NEXT: SB $x0, $x2, 16 :: (store (s8) into %stack.0 + 16) + ; CHECK-NEXT: SB renamable $x10, $x2, 0 :: (store (s8) into %stack.0) + ; CHECK-NEXT: SB renamable $x10, $x2, 4 :: (store (s8) into %stack.0 + 4) + ; CHECK-NEXT: renamable $x11 = SRLI renamable $x10, 24 + ; CHECK-NEXT: SB renamable $x11, $x2, 3 :: (store (s8) into %stack.0 + 3) + ; CHECK-NEXT: renamable $x12 = SRLI renamable $x10, 16 + ; CHECK-NEXT: SB renamable $x12, $x2, 2 :: (store (s8) into %stack.0 + 2) + ; CHECK-NEXT: renamable $x13 = SRLI renamable $x10, 8 + ; CHECK-NEXT: SB renamable $x13, $x2, 1 :: (store (s8) into %stack.0 + 1) + ; CHECK-NEXT: SB renamable $x10, $x2, 8 :: (store (s8) into %stack.0 + 8) + ; CHECK-NEXT: SB renamable $x11, $x2, 7 :: (store (s8) into %stack.0 + 7) + ; CHECK-NEXT: SB renamable $x12, $x2, 6 :: (store (s8) into %stack.0 + 6) + ; CHECK-NEXT: SB renamable $x13, $x2, 5 :: (store (s8) into %stack.0 + 5) + ; CHECK-NEXT: SB killed renamable $x10, $x2, 12 :: (store (s8) into %stack.0 + 12) + ; CHECK-NEXT: SB renamable $x11, $x2, 11 :: (store (s8) into %stack.0 + 11) + ; CHECK-NEXT: SB renamable $x12, $x2, 10 :: (store (s8) into %stack.0 + 10) + ; CHECK-NEXT: SB renamable $x13, $x2, 9 :: (store (s8) into %stack.0 + 9) + ; CHECK-NEXT: SB killed renamable $x11, $x2, 15 :: (store (s8) into %stack.0 + 15) + ; CHECK-NEXT: SB killed renamable $x12, $x2, 14 :: (store (s8) into %stack.0 + 14) + ; CHECK-NEXT: SB killed renamable $x13, $x2, 13 :: (store (s8) into %stack.0 + 13) + ; CHECK-NEXT: $x2 = frame-destroy ADDI $x2, 32 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3.exit: + ; CHECK-NEXT: PseudoRET + bb.0.entry: + liveins: $x10 + + renamable $x10 = ADDI $x0, -1 + + 
bb.1.if.end.i: + liveins: $x10 + + BNE $x0, $x0, %bb.3 + PseudoBR %bb.2 + + bb.2.if.then12.i: + liveins: $x10 + + SB $x0, %stack.0, 31 :: (store (s8) into %stack.0 + 31) + SB $x0, %stack.0, 30 :: (store (s8) into %stack.0 + 30) + SB $x0, %stack.0, 29 :: (store (s8) into %stack.0 + 29) + SB $x0, %stack.0, 28 :: (store (s8) into %stack.0 + 28) + SB $x0, %stack.0, 27 :: (store (s8) into %stack.0 + 27) + SB $x0, %stack.0, 26 :: (store (s8) into %stack.0 + 26) + SB $x0, %stack.0, 25 :: (store (s8) into %stack.0 + 25) + SB $x0, %stack.0, 24 :: (store (s8) into %stack.0 + 24) + SB $x0, %stack.0, 23 :: (store (s8) into %stack.0 + 23) + SB $x0, %stack.0, 22 :: (store (s8) into %stack.0 + 22) + SB $x0, %stack.0, 21 :: (store (s8) into %stack.0 + 21) + SB $x0, %stack.0, 20 :: (store (s8) into %stack.0 + 20) + SB $x0, %stack.0, 19 :: (store (s8) into %stack.0 + 19) + SB $x0, %stack.0, 18 :: (store (s8) into %stack.0 + 18) + SB $x0, %stack.0, 17 :: (store (s8) into %stack.0 + 17) + SB $x0, %stack.0, 16 :: (store (s8) into %stack.0 + 16) + SB renamable $x10, %stack.0, 0 :: (store (s8) into %stack.0) + SB renamable $x10, %stack.0, 4 :: (store (s8) into %stack.0 + 4) + renamable $x11 = SRLI renamable $x10, 24 + SB renamable $x11, %stack.0, 3 :: (store (s8) into %stack.0 + 3) + renamable $x12 = SRLI renamable $x10, 16 + SB renamable $x12, %stack.0, 2 :: (store (s8) into %stack.0 + 2) + renamable $x13 = SRLI renamable $x10, 8 + SB renamable $x13, %stack.0, 1 :: (store (s8) into %stack.0 + 1) + SB renamable $x10, %stack.0, 8 :: (store (s8) into %stack.0 + 8) + SB renamable $x11, %stack.0, 7 :: (store (s8) into %stack.0 + 7) + SB renamable $x12, %stack.0, 6 :: (store (s8) into %stack.0 + 6) + SB renamable $x13, %stack.0, 5 :: (store (s8) into %stack.0 + 5) + SB killed renamable $x10, %stack.0, 12 :: (store (s8) into %stack.0 + 12) + SB renamable $x11, %stack.0, 11 :: (store (s8) into %stack.0 + 11) + SB renamable $x12, %stack.0, 10 :: (store (s8) into %stack.0 + 10) + SB renamable 
$x13, %stack.0, 9 :: (store (s8) into %stack.0 + 9) + SB killed renamable $x11, %stack.0, 15 :: (store (s8) into %stack.0 + 15) + SB killed renamable $x12, %stack.0, 14 :: (store (s8) into %stack.0 + 14) + SB killed renamable $x13, %stack.0, 13 :: (store (s8) into %stack.0 + 13) + + bb.3.exit: + PseudoRET + +... diff --git a/llvm/test/CodeGen/X86/avxneconvert-intrinsics.ll b/llvm/test/CodeGen/X86/avxneconvert-intrinsics.ll index 293a67e59e0c9..b311c8831457b 100644 --- a/llvm/test/CodeGen/X86/avxneconvert-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avxneconvert-intrinsics.ll @@ -198,7 +198,6 @@ define <8 x bfloat> @test_int_x86_vcvtneps2bf16128(<4 x float> %A) { ; CHECK-LABEL: test_int_x86_vcvtneps2bf16128: ; CHECK: # %bb.0: ; CHECK-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x7a,0x72,0xc0] -; CHECK-NEXT: # kill: def $xmm1 killed $xmm0 ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %ret = call <8 x bfloat> @llvm.x86.vcvtneps2bf16128(<4 x float> %A) ret <8 x bfloat> %ret @@ -209,7 +208,6 @@ define <8 x bfloat> @test_int_x86_vcvtneps2bf16256(<8 x float> %A) { ; CHECK-LABEL: test_int_x86_vcvtneps2bf16256: ; CHECK: # %bb.0: ; CHECK-NEXT: {vex} vcvtneps2bf16 %ymm0, %xmm0 # encoding: [0xc4,0xe2,0x7e,0x72,0xc0] -; CHECK-NEXT: # kill: def $xmm1 killed $xmm0 ; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %ret = call <8 x bfloat> @llvm.x86.vcvtneps2bf16256(<8 x float> %A) diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll index f680a39a482ec..7a82515ad24b7 100644 --- a/llvm/test/CodeGen/X86/bfloat.ll +++ b/llvm/test/CodeGen/X86/bfloat.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=CHECK,SSE2 -; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16 | FileCheck %s --check-prefixes=CHECK,BF16 +; RUN: llc < %s -mtriple=x86_64-linux-gnu 
-mattr=avx512bf16,avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,F16,BF16 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,F16,FP16 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avxneconvert,f16c | FileCheck %s --check-prefixes=CHECK,AVX,AVXNC define void @add(ptr %pa, ptr %pb, ptr %pc) nounwind { ; SSE2-LABEL: add: @@ -20,22 +22,22 @@ define void @add(ptr %pa, ptr %pb, ptr %pc) nounwind { ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: retq ; -; BF16-LABEL: add: -; BF16: # %bb.0: -; BF16-NEXT: pushq %rbx -; BF16-NEXT: movq %rdx, %rbx -; BF16-NEXT: movzwl (%rsi), %eax -; BF16-NEXT: shll $16, %eax -; BF16-NEXT: vmovd %eax, %xmm0 -; BF16-NEXT: movzwl (%rdi), %eax -; BF16-NEXT: shll $16, %eax -; BF16-NEXT: vmovd %eax, %xmm1 -; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; BF16-NEXT: callq __truncsfbf2@PLT -; BF16-NEXT: vmovd %xmm0, %eax -; BF16-NEXT: movw %ax, (%rbx) -; BF16-NEXT: popq %rbx -; BF16-NEXT: retq +; AVX-LABEL: add: +; AVX: # %bb.0: +; AVX-NEXT: pushq %rbx +; AVX-NEXT: movq %rdx, %rbx +; AVX-NEXT: movzwl (%rsi), %eax +; AVX-NEXT: shll $16, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: movzwl (%rdi), %eax +; AVX-NEXT: shll $16, %eax +; AVX-NEXT: vmovd %eax, %xmm1 +; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: callq __truncsfbf2@PLT +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: movw %ax, (%rbx) +; AVX-NEXT: popq %rbx +; AVX-NEXT: retq %a = load bfloat, ptr %pa %b = load bfloat, ptr %pb %add = fadd bfloat %a, %b @@ -58,19 +60,19 @@ define bfloat @add2(bfloat %a, bfloat %b) nounwind { ; SSE2-NEXT: popq %rax ; SSE2-NEXT: retq ; -; BF16-LABEL: add2: -; BF16: # %bb.0: -; BF16-NEXT: pushq %rax -; BF16-NEXT: vmovd %xmm0, %eax -; BF16-NEXT: vmovd %xmm1, %ecx -; BF16-NEXT: shll $16, %ecx -; BF16-NEXT: vmovd %ecx, %xmm0 -; BF16-NEXT: shll $16, %eax -; BF16-NEXT: vmovd %eax, %xmm1 -; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; BF16-NEXT: callq __truncsfbf2@PLT -; BF16-NEXT: popq %rax -; BF16-NEXT: 
retq +; AVX-LABEL: add2: +; AVX: # %bb.0: +; AVX-NEXT: pushq %rax +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vmovd %xmm1, %ecx +; AVX-NEXT: shll $16, %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: shll $16, %eax +; AVX-NEXT: vmovd %eax, %xmm1 +; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: callq __truncsfbf2@PLT +; AVX-NEXT: popq %rax +; AVX-NEXT: retq %add = fadd bfloat %a, %b ret bfloat %add } @@ -105,34 +107,34 @@ define void @add_double(ptr %pa, ptr %pb, ptr %pc) nounwind { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; BF16-LABEL: add_double: -; BF16: # %bb.0: -; BF16-NEXT: pushq %rbp -; BF16-NEXT: pushq %r14 -; BF16-NEXT: pushq %rbx -; BF16-NEXT: movq %rdx, %rbx -; BF16-NEXT: movq %rsi, %r14 -; BF16-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; BF16-NEXT: callq __truncdfbf2@PLT -; BF16-NEXT: vmovd %xmm0, %ebp -; BF16-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; BF16-NEXT: callq __truncdfbf2@PLT -; BF16-NEXT: vmovd %xmm0, %eax -; BF16-NEXT: shll $16, %eax -; BF16-NEXT: vmovd %eax, %xmm0 -; BF16-NEXT: shll $16, %ebp -; BF16-NEXT: vmovd %ebp, %xmm1 -; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; BF16-NEXT: callq __truncsfbf2@PLT -; BF16-NEXT: vmovd %xmm0, %eax -; BF16-NEXT: shll $16, %eax -; BF16-NEXT: vmovd %eax, %xmm0 -; BF16-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; BF16-NEXT: vmovsd %xmm0, (%rbx) -; BF16-NEXT: popq %rbx -; BF16-NEXT: popq %r14 -; BF16-NEXT: popq %rbp -; BF16-NEXT: retq +; AVX-LABEL: add_double: +; AVX: # %bb.0: +; AVX-NEXT: pushq %rbp +; AVX-NEXT: pushq %r14 +; AVX-NEXT: pushq %rbx +; AVX-NEXT: movq %rdx, %rbx +; AVX-NEXT: movq %rsi, %r14 +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: callq __truncdfbf2@PLT +; AVX-NEXT: vmovd %xmm0, %ebp +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: callq __truncdfbf2@PLT +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: shll $16, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: shll $16, %ebp +; AVX-NEXT: vmovd %ebp, %xmm1 +; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: callq 
__truncsfbf2@PLT +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: shll $16, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovsd %xmm0, (%rbx) +; AVX-NEXT: popq %rbx +; AVX-NEXT: popq %r14 +; AVX-NEXT: popq %rbp +; AVX-NEXT: retq %la = load double, ptr %pa %a = fptrunc double %la to bfloat %lb = load double, ptr %pb @@ -169,30 +171,30 @@ define double @add_double2(double %da, double %db) nounwind { ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: retq ; -; BF16-LABEL: add_double2: -; BF16: # %bb.0: -; BF16-NEXT: pushq %rbx -; BF16-NEXT: subq $16, %rsp -; BF16-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; BF16-NEXT: callq __truncdfbf2@PLT -; BF16-NEXT: vmovd %xmm0, %ebx -; BF16-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload -; BF16-NEXT: # xmm0 = mem[0],zero -; BF16-NEXT: callq __truncdfbf2@PLT -; BF16-NEXT: vmovd %xmm0, %eax -; BF16-NEXT: shll $16, %eax -; BF16-NEXT: vmovd %eax, %xmm0 -; BF16-NEXT: shll $16, %ebx -; BF16-NEXT: vmovd %ebx, %xmm1 -; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; BF16-NEXT: callq __truncsfbf2@PLT -; BF16-NEXT: vmovd %xmm0, %eax -; BF16-NEXT: shll $16, %eax -; BF16-NEXT: vmovd %eax, %xmm0 -; BF16-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; BF16-NEXT: addq $16, %rsp -; BF16-NEXT: popq %rbx -; BF16-NEXT: retq +; AVX-LABEL: add_double2: +; AVX: # %bb.0: +; AVX-NEXT: pushq %rbx +; AVX-NEXT: subq $16, %rsp +; AVX-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX-NEXT: callq __truncdfbf2@PLT +; AVX-NEXT: vmovd %xmm0, %ebx +; AVX-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[0],zero +; AVX-NEXT: callq __truncdfbf2@PLT +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: shll $16, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: shll $16, %ebx +; AVX-NEXT: vmovd %ebx, %xmm1 +; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: callq __truncsfbf2@PLT +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: shll $16, %eax +; AVX-NEXT: vmovd %eax, %xmm0 
+; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: addq $16, %rsp +; AVX-NEXT: popq %rbx +; AVX-NEXT: retq %a = fptrunc double %da to bfloat %b = fptrunc double %db to bfloat %add = fadd bfloat %a, %b @@ -215,19 +217,19 @@ define void @add_constant(ptr %pa, ptr %pc) nounwind { ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: retq ; -; BF16-LABEL: add_constant: -; BF16: # %bb.0: -; BF16-NEXT: pushq %rbx -; BF16-NEXT: movq %rsi, %rbx -; BF16-NEXT: movzwl (%rdi), %eax -; BF16-NEXT: shll $16, %eax -; BF16-NEXT: vmovd %eax, %xmm0 -; BF16-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; BF16-NEXT: callq __truncsfbf2@PLT -; BF16-NEXT: vmovd %xmm0, %eax -; BF16-NEXT: movw %ax, (%rbx) -; BF16-NEXT: popq %rbx -; BF16-NEXT: retq +; AVX-LABEL: add_constant: +; AVX: # %bb.0: +; AVX-NEXT: pushq %rbx +; AVX-NEXT: movq %rsi, %rbx +; AVX-NEXT: movzwl (%rdi), %eax +; AVX-NEXT: shll $16, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: callq __truncsfbf2@PLT +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: movw %ax, (%rbx) +; AVX-NEXT: popq %rbx +; AVX-NEXT: retq %a = load bfloat, ptr %pa %add = fadd bfloat %a, 1.0 store bfloat %add, ptr %pc @@ -246,16 +248,16 @@ define bfloat @add_constant2(bfloat %a) nounwind { ; SSE2-NEXT: popq %rax ; SSE2-NEXT: retq ; -; BF16-LABEL: add_constant2: -; BF16: # %bb.0: -; BF16-NEXT: pushq %rax -; BF16-NEXT: vmovd %xmm0, %eax -; BF16-NEXT: shll $16, %eax -; BF16-NEXT: vmovd %eax, %xmm0 -; BF16-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; BF16-NEXT: callq __truncsfbf2@PLT -; BF16-NEXT: popq %rax -; BF16-NEXT: retq +; AVX-LABEL: add_constant2: +; AVX: # %bb.0: +; AVX-NEXT: pushq %rax +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: shll $16, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: callq __truncsfbf2@PLT +; AVX-NEXT: popq %rax +; AVX-NEXT: retq %add = fadd bfloat %a, 1.0 ret bfloat %add } @@ -540,6 +542,235 
@@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind { ; BF16-NEXT: popq %r15 ; BF16-NEXT: popq %rbp ; BF16-NEXT: retq +; +; FP16-LABEL: addv: +; FP16: # %bb.0: +; FP16-NEXT: pushq %rbp +; FP16-NEXT: pushq %r15 +; FP16-NEXT: pushq %r14 +; FP16-NEXT: pushq %r13 +; FP16-NEXT: pushq %r12 +; FP16-NEXT: pushq %rbx +; FP16-NEXT: subq $40, %rsp +; FP16-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FP16-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; FP16-NEXT: vmovw %xmm1, %eax +; FP16-NEXT: shll $16, %eax +; FP16-NEXT: vmovd %eax, %xmm2 +; FP16-NEXT: vmovw %xmm0, %eax +; FP16-NEXT: shll $16, %eax +; FP16-NEXT: vmovd %eax, %xmm1 +; FP16-NEXT: vaddss %xmm2, %xmm1, %xmm0 +; FP16-NEXT: callq __truncsfbf2@PLT +; FP16-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; FP16-NEXT: vpextrw $7, %xmm0, %eax +; FP16-NEXT: shll $16, %eax +; FP16-NEXT: vmovd %eax, %xmm0 +; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; FP16-NEXT: vpextrw $7, %xmm1, %eax +; FP16-NEXT: shll $16, %eax +; FP16-NEXT: vmovd %eax, %xmm1 +; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; FP16-NEXT: callq __truncsfbf2@PLT +; FP16-NEXT: vmovd %xmm0, %ebp +; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; FP16-NEXT: vpextrw $6, %xmm0, %eax +; FP16-NEXT: shll $16, %eax +; FP16-NEXT: vmovd %eax, %xmm0 +; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; FP16-NEXT: vpextrw $6, %xmm1, %eax +; FP16-NEXT: shll $16, %eax +; FP16-NEXT: vmovd %eax, %xmm1 +; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; FP16-NEXT: callq __truncsfbf2@PLT +; FP16-NEXT: vmovd %xmm0, %r14d +; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; FP16-NEXT: vpextrw $5, %xmm0, %eax +; FP16-NEXT: shll $16, %eax +; FP16-NEXT: vmovd %eax, %xmm0 +; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; FP16-NEXT: vpextrw $5, %xmm1, %eax +; FP16-NEXT: shll $16, %eax +; FP16-NEXT: vmovd %eax, %xmm1 
+; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; FP16-NEXT: callq __truncsfbf2@PLT +; FP16-NEXT: vmovd %xmm0, %r15d +; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; FP16-NEXT: vpextrw $4, %xmm0, %eax +; FP16-NEXT: shll $16, %eax +; FP16-NEXT: vmovd %eax, %xmm0 +; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; FP16-NEXT: vpextrw $4, %xmm1, %eax +; FP16-NEXT: shll $16, %eax +; FP16-NEXT: vmovd %eax, %xmm1 +; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; FP16-NEXT: callq __truncsfbf2@PLT +; FP16-NEXT: vmovd %xmm0, %r12d +; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; FP16-NEXT: vpextrw $3, %xmm0, %eax +; FP16-NEXT: shll $16, %eax +; FP16-NEXT: vmovd %eax, %xmm0 +; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; FP16-NEXT: vpextrw $3, %xmm1, %eax +; FP16-NEXT: shll $16, %eax +; FP16-NEXT: vmovd %eax, %xmm1 +; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; FP16-NEXT: callq __truncsfbf2@PLT +; FP16-NEXT: vmovd %xmm0, %r13d +; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; FP16-NEXT: vpextrw $2, %xmm0, %eax +; FP16-NEXT: shll $16, %eax +; FP16-NEXT: vmovd %eax, %xmm0 +; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; FP16-NEXT: vpextrw $2, %xmm1, %eax +; FP16-NEXT: shll $16, %eax +; FP16-NEXT: vmovd %eax, %xmm1 +; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; FP16-NEXT: callq __truncsfbf2@PLT +; FP16-NEXT: vmovd %xmm0, %ebx +; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; FP16-NEXT: vpextrw $1, %xmm0, %eax +; FP16-NEXT: shll $16, %eax +; FP16-NEXT: vmovd %eax, %xmm0 +; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; FP16-NEXT: vpextrw $1, %xmm1, %eax +; FP16-NEXT: shll $16, %eax +; FP16-NEXT: vmovd %eax, %xmm1 +; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; FP16-NEXT: callq __truncsfbf2@PLT +; FP16-NEXT: vmovd %xmm0, %eax +; FP16-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FP16-NEXT: # xmm0 = mem[0],zero,zero,zero +; FP16-NEXT: vpinsrw $1, %eax, %xmm0, 
%xmm0 +; FP16-NEXT: vpinsrw $2, %ebx, %xmm0, %xmm0 +; FP16-NEXT: vpinsrw $3, %r13d, %xmm0, %xmm0 +; FP16-NEXT: vpinsrw $4, %r12d, %xmm0, %xmm0 +; FP16-NEXT: vpinsrw $5, %r15d, %xmm0, %xmm0 +; FP16-NEXT: vpinsrw $6, %r14d, %xmm0, %xmm0 +; FP16-NEXT: vpinsrw $7, %ebp, %xmm0, %xmm0 +; FP16-NEXT: addq $40, %rsp +; FP16-NEXT: popq %rbx +; FP16-NEXT: popq %r12 +; FP16-NEXT: popq %r13 +; FP16-NEXT: popq %r14 +; FP16-NEXT: popq %r15 +; FP16-NEXT: popq %rbp +; FP16-NEXT: retq +; +; AVXNC-LABEL: addv: +; AVXNC: # %bb.0: +; AVXNC-NEXT: pushq %rbp +; AVXNC-NEXT: pushq %r15 +; AVXNC-NEXT: pushq %r14 +; AVXNC-NEXT: pushq %r13 +; AVXNC-NEXT: pushq %r12 +; AVXNC-NEXT: pushq %rbx +; AVXNC-NEXT: subq $40, %rsp +; AVXNC-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVXNC-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVXNC-NEXT: vpextrw $7, %xmm1, %eax +; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm2 +; AVXNC-NEXT: vpextrw $7, %xmm0, %eax +; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm1 +; AVXNC-NEXT: vaddss %xmm2, %xmm1, %xmm0 +; AVXNC-NEXT: callq __truncsfbf2@PLT +; AVXNC-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVXNC-NEXT: vpextrw $6, %xmm0, %eax +; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm0 +; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVXNC-NEXT: vpextrw $6, %xmm1, %eax +; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm1 +; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVXNC-NEXT: callq __truncsfbf2@PLT +; AVXNC-NEXT: vmovd %xmm0, %ebp +; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVXNC-NEXT: vpextrw $5, %xmm0, %eax +; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm0 +; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVXNC-NEXT: vpextrw $5, %xmm1, %eax +; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm1 +; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; 
AVXNC-NEXT: callq __truncsfbf2@PLT +; AVXNC-NEXT: vmovd %xmm0, %r14d +; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVXNC-NEXT: vpextrw $4, %xmm0, %eax +; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm0 +; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVXNC-NEXT: vpextrw $4, %xmm1, %eax +; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm1 +; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVXNC-NEXT: callq __truncsfbf2@PLT +; AVXNC-NEXT: vmovd %xmm0, %r15d +; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVXNC-NEXT: vpextrw $3, %xmm0, %eax +; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm0 +; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVXNC-NEXT: vpextrw $3, %xmm1, %eax +; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm1 +; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVXNC-NEXT: callq __truncsfbf2@PLT +; AVXNC-NEXT: vmovd %xmm0, %r12d +; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVXNC-NEXT: vpextrw $2, %xmm0, %eax +; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm0 +; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVXNC-NEXT: vpextrw $2, %xmm1, %eax +; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm1 +; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVXNC-NEXT: callq __truncsfbf2@PLT +; AVXNC-NEXT: vmovd %xmm0, %r13d +; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVXNC-NEXT: vpextrw $1, %xmm0, %eax +; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm0 +; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVXNC-NEXT: vpextrw $1, %xmm1, %eax +; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm1 +; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVXNC-NEXT: callq __truncsfbf2@PLT +; AVXNC-NEXT: vmovd %xmm0, %ebx +; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVXNC-NEXT: vmovd %xmm0, %eax +; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm0 +; 
AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVXNC-NEXT: vmovd %xmm1, %eax +; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm1 +; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVXNC-NEXT: callq __truncsfbf2@PLT +; AVXNC-NEXT: vmovd %xmm0, %eax +; AVXNC-NEXT: vmovd %eax, %xmm0 +; AVXNC-NEXT: vpinsrw $1, %ebx, %xmm0, %xmm0 +; AVXNC-NEXT: vpinsrw $2, %r13d, %xmm0, %xmm0 +; AVXNC-NEXT: vpinsrw $3, %r12d, %xmm0, %xmm0 +; AVXNC-NEXT: vpinsrw $4, %r15d, %xmm0, %xmm0 +; AVXNC-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0 +; AVXNC-NEXT: vpinsrw $6, %ebp, %xmm0, %xmm0 +; AVXNC-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVXNC-NEXT: addq $40, %rsp +; AVXNC-NEXT: popq %rbx +; AVXNC-NEXT: popq %r12 +; AVXNC-NEXT: popq %r13 +; AVXNC-NEXT: popq %r14 +; AVXNC-NEXT: popq %r15 +; AVXNC-NEXT: popq %rbp +; AVXNC-NEXT: retq %add = fadd <8 x bfloat> %a, %b ret <8 x bfloat> %add } @@ -554,13 +785,13 @@ define <2 x bfloat> @pr62997(bfloat %a, bfloat %b) { ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: retq ; -; BF16-LABEL: pr62997: -; BF16: # %bb.0: -; BF16-NEXT: vmovd %xmm1, %eax -; BF16-NEXT: vmovd %xmm0, %ecx -; BF16-NEXT: vmovd %ecx, %xmm0 -; BF16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 -; BF16-NEXT: retq +; AVX-LABEL: pr62997: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %xmm1, %eax +; AVX-NEXT: vmovd %xmm0, %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: retq %1 = insertelement <2 x bfloat> undef, bfloat %a, i64 0 %2 = insertelement <2 x bfloat> %1, bfloat %b, i64 1 ret <2 x bfloat> %2 @@ -575,10 +806,16 @@ define <32 x bfloat> @pr63017() { ; SSE2-NEXT: xorps %xmm3, %xmm3 ; SSE2-NEXT: retq ; -; BF16-LABEL: pr63017: -; BF16: # %bb.0: -; BF16-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; BF16-NEXT: retq +; F16-LABEL: pr63017: +; F16: # %bb.0: +; F16-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; F16-NEXT: retq +; +; AVXNC-LABEL: pr63017: +; 
AVXNC: # %bb.0: +; AVXNC-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVXNC-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVXNC-NEXT: retq ret <32 x bfloat> zeroinitializer } @@ -1149,11 +1386,259 @@ define <32 x bfloat> @pr63017_2() nounwind { ; SSE2-NEXT: popq %r14 ; SSE2-NEXT: retq ; -; BF16-LABEL: pr63017_2: -; BF16: # %bb.0: -; BF16-NEXT: vpbroadcastw {{.*#+}} zmm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024] -; BF16-NEXT: vmovdqu16 (%rax), %zmm0 {%k1} -; BF16-NEXT: retq +; F16-LABEL: pr63017_2: +; F16: # %bb.0: +; F16-NEXT: vpbroadcastw {{.*#+}} zmm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024] +; F16-NEXT: vmovdqu16 (%rax), %zmm0 {%k1} +; F16-NEXT: retq +; +; AVXNC-LABEL: pr63017_2: +; AVXNC: # %bb.0: +; AVXNC-NEXT: vpbroadcastw {{.*#+}} ymm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024] +; AVXNC-NEXT: xorl %eax, %eax +; AVXNC-NEXT: testb %al, %al +; AVXNC-NEXT: vmovdqa %ymm0, %ymm1 +; AVXNC-NEXT: jne .LBB12_2 +; AVXNC-NEXT: # %bb.1: # %cond.load +; AVXNC-NEXT: vpbroadcastw {{.*#+}} ymm1 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024] +; AVXNC-NEXT: vpbroadcastw {{.*#+}} xmm0 = [49024,49024,49024,49024,49024,49024,49024,49024] +; AVXNC-NEXT: vpinsrw $0, (%rax), %xmm0, %xmm0 +; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVXNC-NEXT: .LBB12_2: # %else +; AVXNC-NEXT: xorl %eax, %eax +; AVXNC-NEXT: testb %al, %al +; AVXNC-NEXT: jne .LBB12_4 +; AVXNC-NEXT: # %bb.3: # %cond.load1 +; AVXNC-NEXT: vpinsrw $1, (%rax), %xmm0, %xmm2 +; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVXNC-NEXT: .LBB12_4: # %else2 +; AVXNC-NEXT: xorl %eax, %eax +; 
AVXNC-NEXT: testb %al, %al +; AVXNC-NEXT: jne .LBB12_6 +; AVXNC-NEXT: # %bb.5: # %cond.load4 +; AVXNC-NEXT: vpinsrw $2, (%rax), %xmm0, %xmm2 +; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVXNC-NEXT: .LBB12_6: # %else5 +; AVXNC-NEXT: xorl %eax, %eax +; AVXNC-NEXT: testb %al, %al +; AVXNC-NEXT: jne .LBB12_8 +; AVXNC-NEXT: # %bb.7: # %cond.load7 +; AVXNC-NEXT: vpinsrw $3, (%rax), %xmm0, %xmm2 +; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVXNC-NEXT: .LBB12_8: # %else8 +; AVXNC-NEXT: xorl %eax, %eax +; AVXNC-NEXT: testb %al, %al +; AVXNC-NEXT: jne .LBB12_10 +; AVXNC-NEXT: # %bb.9: # %cond.load10 +; AVXNC-NEXT: vpinsrw $4, (%rax), %xmm0, %xmm2 +; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVXNC-NEXT: .LBB12_10: # %else11 +; AVXNC-NEXT: xorl %eax, %eax +; AVXNC-NEXT: testb %al, %al +; AVXNC-NEXT: jne .LBB12_12 +; AVXNC-NEXT: # %bb.11: # %cond.load13 +; AVXNC-NEXT: vpinsrw $5, (%rax), %xmm0, %xmm2 +; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVXNC-NEXT: .LBB12_12: # %else14 +; AVXNC-NEXT: xorl %eax, %eax +; AVXNC-NEXT: testb %al, %al +; AVXNC-NEXT: jne .LBB12_14 +; AVXNC-NEXT: # %bb.13: # %cond.load16 +; AVXNC-NEXT: vpinsrw $6, (%rax), %xmm0, %xmm2 +; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVXNC-NEXT: .LBB12_14: # %else17 +; AVXNC-NEXT: xorl %eax, %eax +; AVXNC-NEXT: testb %al, %al +; AVXNC-NEXT: jne .LBB12_16 +; AVXNC-NEXT: # %bb.15: # %cond.load19 +; AVXNC-NEXT: vpinsrw $7, (%rax), %xmm0, %xmm2 +; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVXNC-NEXT: .LBB12_16: # %else20 +; AVXNC-NEXT: xorl %eax, %eax +; AVXNC-NEXT: testb %al, %al +; AVXNC-NEXT: jne .LBB12_18 +; AVXNC-NEXT: # %bb.17: # %cond.load22 +; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2 +; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; 
AVXNC-NEXT: .LBB12_18: # %else23 +; AVXNC-NEXT: xorl %eax, %eax +; AVXNC-NEXT: testb %al, %al +; AVXNC-NEXT: jne .LBB12_20 +; AVXNC-NEXT: # %bb.19: # %cond.load25 +; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2 +; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7,8],ymm2[9],ymm0[10,11,12,13,14,15] +; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVXNC-NEXT: .LBB12_20: # %else26 +; AVXNC-NEXT: xorl %eax, %eax +; AVXNC-NEXT: testb %al, %al +; AVXNC-NEXT: jne .LBB12_22 +; AVXNC-NEXT: # %bb.21: # %cond.load28 +; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2 +; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4,5,6,7,8,9],ymm2[10],ymm0[11,12,13,14,15] +; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVXNC-NEXT: .LBB12_22: # %else29 +; AVXNC-NEXT: xorl %eax, %eax +; AVXNC-NEXT: testb %al, %al +; AVXNC-NEXT: jne .LBB12_24 +; AVXNC-NEXT: # %bb.23: # %cond.load31 +; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2 +; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15] +; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVXNC-NEXT: .LBB12_24: # %else32 +; AVXNC-NEXT: xorl %eax, %eax +; AVXNC-NEXT: testb %al, %al +; AVXNC-NEXT: jne .LBB12_26 +; AVXNC-NEXT: # %bb.25: # %cond.load34 +; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2 +; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7,8,9,10,11],ymm2[12],ymm0[13,14,15] +; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVXNC-NEXT: .LBB12_26: # %else35 +; AVXNC-NEXT: xorl %eax, %eax +; AVXNC-NEXT: testb %al, %al +; AVXNC-NEXT: jne .LBB12_28 +; AVXNC-NEXT: # %bb.27: # %cond.load37 +; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2 +; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7,8,9,10,11,12],ymm2[13],ymm0[14,15] +; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVXNC-NEXT: .LBB12_28: # %else38 +; AVXNC-NEXT: xorl %eax, %eax +; AVXNC-NEXT: 
testb %al, %al +; AVXNC-NEXT: jne .LBB12_30 +; AVXNC-NEXT: # %bb.29: # %cond.load40 +; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2 +; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7,8,9,10,11,12,13],ymm2[14],ymm0[15] +; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVXNC-NEXT: .LBB12_30: # %else41 +; AVXNC-NEXT: xorl %eax, %eax +; AVXNC-NEXT: testb %al, %al +; AVXNC-NEXT: jne .LBB12_32 +; AVXNC-NEXT: # %bb.31: # %cond.load43 +; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2 +; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5,6],ymm2[7],ymm0[8,9,10,11,12,13,14],ymm2[15] +; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVXNC-NEXT: .LBB12_32: # %else44 +; AVXNC-NEXT: xorl %eax, %eax +; AVXNC-NEXT: testb %al, %al +; AVXNC-NEXT: jne .LBB12_34 +; AVXNC-NEXT: # %bb.33: # %cond.load46 +; AVXNC-NEXT: vpinsrw $0, (%rax), %xmm1, %xmm2 +; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVXNC-NEXT: .LBB12_34: # %else47 +; AVXNC-NEXT: xorl %eax, %eax +; AVXNC-NEXT: testb %al, %al +; AVXNC-NEXT: jne .LBB12_36 +; AVXNC-NEXT: # %bb.35: # %cond.load49 +; AVXNC-NEXT: vpinsrw $1, (%rax), %xmm1, %xmm2 +; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVXNC-NEXT: .LBB12_36: # %else50 +; AVXNC-NEXT: xorl %eax, %eax +; AVXNC-NEXT: testb %al, %al +; AVXNC-NEXT: jne .LBB12_38 +; AVXNC-NEXT: # %bb.37: # %cond.load52 +; AVXNC-NEXT: vpinsrw $2, (%rax), %xmm1, %xmm2 +; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVXNC-NEXT: .LBB12_38: # %else53 +; AVXNC-NEXT: xorl %eax, %eax +; AVXNC-NEXT: testb %al, %al +; AVXNC-NEXT: jne .LBB12_40 +; AVXNC-NEXT: # %bb.39: # %cond.load55 +; AVXNC-NEXT: vpinsrw $3, (%rax), %xmm1, %xmm2 +; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVXNC-NEXT: .LBB12_40: # %else56 +; AVXNC-NEXT: xorl %eax, %eax +; AVXNC-NEXT: testb %al, %al +; AVXNC-NEXT: jne .LBB12_42 +; AVXNC-NEXT: # %bb.41: # %cond.load58 +; AVXNC-NEXT: vpinsrw 
$4, (%rax), %xmm1, %xmm2 +; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVXNC-NEXT: .LBB12_42: # %else59 +; AVXNC-NEXT: xorl %eax, %eax +; AVXNC-NEXT: testb %al, %al +; AVXNC-NEXT: jne .LBB12_44 +; AVXNC-NEXT: # %bb.43: # %cond.load61 +; AVXNC-NEXT: vpinsrw $5, (%rax), %xmm1, %xmm2 +; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVXNC-NEXT: .LBB12_44: # %else62 +; AVXNC-NEXT: xorl %eax, %eax +; AVXNC-NEXT: testb %al, %al +; AVXNC-NEXT: jne .LBB12_46 +; AVXNC-NEXT: # %bb.45: # %cond.load64 +; AVXNC-NEXT: vpinsrw $6, (%rax), %xmm1, %xmm2 +; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVXNC-NEXT: .LBB12_46: # %else65 +; AVXNC-NEXT: xorl %eax, %eax +; AVXNC-NEXT: testb %al, %al +; AVXNC-NEXT: jne .LBB12_48 +; AVXNC-NEXT: # %bb.47: # %cond.load67 +; AVXNC-NEXT: vpinsrw $7, (%rax), %xmm1, %xmm2 +; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVXNC-NEXT: .LBB12_48: # %else68 +; AVXNC-NEXT: xorl %eax, %eax +; AVXNC-NEXT: testb %al, %al +; AVXNC-NEXT: jne .LBB12_50 +; AVXNC-NEXT: # %bb.49: # %cond.load70 +; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2 +; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] +; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVXNC-NEXT: .LBB12_50: # %else71 +; AVXNC-NEXT: xorl %eax, %eax +; AVXNC-NEXT: testb %al, %al +; AVXNC-NEXT: jne .LBB12_52 +; AVXNC-NEXT: # %bb.51: # %cond.load73 +; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2 +; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7,8],ymm2[9],ymm1[10,11,12,13,14,15] +; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVXNC-NEXT: .LBB12_52: # %else74 +; AVXNC-NEXT: xorl %eax, %eax +; AVXNC-NEXT: testb %al, %al +; AVXNC-NEXT: jne .LBB12_54 +; AVXNC-NEXT: # %bb.53: # %cond.load76 +; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2 +; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm1[0,1],ymm2[2],ymm1[3,4,5,6,7,8,9],ymm2[10],ymm1[11,12,13,14,15] +; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVXNC-NEXT: .LBB12_54: # %else77 +; AVXNC-NEXT: xorl %eax, %eax +; AVXNC-NEXT: testb %al, %al +; AVXNC-NEXT: jne .LBB12_56 +; AVXNC-NEXT: # %bb.55: # %cond.load79 +; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2 +; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7,8,9,10],ymm2[11],ymm1[12,13,14,15] +; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVXNC-NEXT: .LBB12_56: # %else80 +; AVXNC-NEXT: xorl %eax, %eax +; AVXNC-NEXT: testb %al, %al +; AVXNC-NEXT: jne .LBB12_58 +; AVXNC-NEXT: # %bb.57: # %cond.load82 +; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2 +; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7,8,9,10,11],ymm2[12],ymm1[13,14,15] +; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVXNC-NEXT: .LBB12_58: # %else83 +; AVXNC-NEXT: xorl %eax, %eax +; AVXNC-NEXT: testb %al, %al +; AVXNC-NEXT: jne .LBB12_60 +; AVXNC-NEXT: # %bb.59: # %cond.load85 +; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2 +; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7,8,9,10,11,12],ymm2[13],ymm1[14,15] +; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVXNC-NEXT: .LBB12_60: # %else86 +; AVXNC-NEXT: xorl %eax, %eax +; AVXNC-NEXT: testb %al, %al +; AVXNC-NEXT: jne .LBB12_62 +; AVXNC-NEXT: # %bb.61: # %cond.load88 +; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2 +; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15] +; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVXNC-NEXT: .LBB12_62: # %else89 +; AVXNC-NEXT: xorl %eax, %eax +; AVXNC-NEXT: testb %al, %al +; AVXNC-NEXT: jne .LBB12_64 +; AVXNC-NEXT: # %bb.63: # %cond.load91 +; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2 +; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,6],ymm2[7],ymm1[8,9,10,11,12,13,14],ymm2[15] +; AVXNC-NEXT: 
vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVXNC-NEXT: .LBB12_64: # %else92 +; AVXNC-NEXT: retq %1 = call <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr poison, i32 2, <32 x i1> poison, <32 x bfloat> ) ret <32 x bfloat> %1 } @@ -1173,14 +1658,806 @@ define <32 x bfloat> @pr62997_3(<32 x bfloat> %0, bfloat %1) { ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] ; SSE2-NEXT: retq ; -; BF16-LABEL: pr62997_3: -; BF16: # %bb.0: -; BF16-NEXT: vmovd %xmm1, %eax -; BF16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm1 -; BF16-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 -; BF16-NEXT: retq +; F16-LABEL: pr62997_3: +; F16: # %bb.0: +; F16-NEXT: vmovd %xmm1, %eax +; F16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm1 +; F16-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; F16-NEXT: retq +; +; AVXNC-LABEL: pr62997_3: +; AVXNC: # %bb.0: +; AVXNC-NEXT: vmovd %xmm2, %eax +; AVXNC-NEXT: vpinsrw $1, %eax, %xmm0, %xmm2 +; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVXNC-NEXT: retq %3 = insertelement <32 x bfloat> %0, bfloat %1, i64 1 ret <32 x bfloat> %3 } declare <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr, i32, <32 x i1>, <32 x bfloat>) + +define <4 x float> @pr64460_1(<4 x bfloat> %a) { +; SSE2-LABEL: pr64460_1: +; SSE2: # %bb.0: +; SSE2-NEXT: pextrw $1, %xmm0, %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: pextrw $3, %xmm0, %eax +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: pr64460_1: +; AVX: # %bb.0: +; AVX-NEXT: vpxor 
%xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: retq + %b = fpext <4 x bfloat> %a to <4 x float> + ret <4 x float> %b +} + +define <8 x float> @pr64460_2(<8 x bfloat> %a) { +; SSE2-LABEL: pr64460_2: +; SSE2: # %bb.0: +; SSE2-NEXT: movq %xmm0, %rdx +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: movq %xmm0, %rcx +; SSE2-NEXT: movq %rcx, %rax +; SSE2-NEXT: shrq $32, %rax +; SSE2-NEXT: movq %rdx, %rsi +; SSE2-NEXT: shrq $32, %rsi +; SSE2-NEXT: movl %edx, %edi +; SSE2-NEXT: andl $-65536, %edi # imm = 0xFFFF0000 +; SSE2-NEXT: movd %edi, %xmm1 +; SSE2-NEXT: movl %edx, %edi +; SSE2-NEXT: shll $16, %edi +; SSE2-NEXT: movd %edi, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: shrq $48, %rdx +; SSE2-NEXT: shll $16, %edx +; SSE2-NEXT: movd %edx, %xmm1 +; SSE2-NEXT: shll $16, %esi +; SSE2-NEXT: movd %esi, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: movl %ecx, %edx +; SSE2-NEXT: andl $-65536, %edx # imm = 0xFFFF0000 +; SSE2-NEXT: movd %edx, %xmm2 +; SSE2-NEXT: movl %ecx, %edx +; SSE2-NEXT: shll $16, %edx +; SSE2-NEXT: movd %edx, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: shrq $48, %rcx +; SSE2-NEXT: shll $16, %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE2-NEXT: retq +; +; AVX-LABEL: pr64460_2: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX-NEXT: vpslld $16, %ymm0, %ymm0 +; AVX-NEXT: retq + %b = fpext <8 x bfloat> %a to <8 x float> + ret <8 x float> %b +} + +define <16 x float> 
@pr64460_3(<16 x bfloat> %a) { +; SSE2-LABEL: pr64460_3: +; SSE2: # %bb.0: +; SSE2-NEXT: movq %xmm1, %rdi +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1,1] +; SSE2-NEXT: movq %xmm1, %rcx +; SSE2-NEXT: movq %rcx, %rax +; SSE2-NEXT: shrq $32, %rax +; SSE2-NEXT: movq %xmm0, %r9 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: movq %xmm0, %rsi +; SSE2-NEXT: movq %rsi, %rdx +; SSE2-NEXT: shrq $32, %rdx +; SSE2-NEXT: movq %rdi, %r8 +; SSE2-NEXT: shrq $32, %r8 +; SSE2-NEXT: movq %r9, %r10 +; SSE2-NEXT: shrq $32, %r10 +; SSE2-NEXT: movl %r9d, %r11d +; SSE2-NEXT: andl $-65536, %r11d # imm = 0xFFFF0000 +; SSE2-NEXT: movd %r11d, %xmm1 +; SSE2-NEXT: movl %r9d, %r11d +; SSE2-NEXT: shll $16, %r11d +; SSE2-NEXT: movd %r11d, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: shrq $48, %r9 +; SSE2-NEXT: shll $16, %r9d +; SSE2-NEXT: movd %r9d, %xmm1 +; SSE2-NEXT: shll $16, %r10d +; SSE2-NEXT: movd %r10d, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: movl %edi, %r9d +; SSE2-NEXT: andl $-65536, %r9d # imm = 0xFFFF0000 +; SSE2-NEXT: movd %r9d, %xmm1 +; SSE2-NEXT: movl %edi, %r9d +; SSE2-NEXT: shll $16, %r9d +; SSE2-NEXT: movd %r9d, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: shrq $48, %rdi +; SSE2-NEXT: shll $16, %edi +; SSE2-NEXT: movd %edi, %xmm1 +; SSE2-NEXT: shll $16, %r8d +; SSE2-NEXT: movd %r8d, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE2-NEXT: movl %esi, %edi +; SSE2-NEXT: andl $-65536, %edi # imm = 0xFFFF0000 +; SSE2-NEXT: movd %edi, %xmm3 +; SSE2-NEXT: movl %esi, %edi +; SSE2-NEXT: shll $16, %edi +; SSE2-NEXT: movd %edi, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-NEXT: shrq $48, %rsi +; SSE2-NEXT: shll $16, %esi +; SSE2-NEXT: movd 
%esi, %xmm3 +; SSE2-NEXT: shll $16, %edx +; SSE2-NEXT: movd %edx, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; SSE2-NEXT: movl %ecx, %edx +; SSE2-NEXT: andl $-65536, %edx # imm = 0xFFFF0000 +; SSE2-NEXT: movd %edx, %xmm4 +; SSE2-NEXT: movl %ecx, %edx +; SSE2-NEXT: shll $16, %edx +; SSE2-NEXT: movd %edx, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE2-NEXT: shrq $48, %rcx +; SSE2-NEXT: shll $16, %ecx +; SSE2-NEXT: movd %ecx, %xmm4 +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] +; SSE2-NEXT: retq +; +; F16-LABEL: pr64460_3: +; F16: # %bb.0: +; F16-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; F16-NEXT: vpslld $16, %zmm0, %zmm0 +; F16-NEXT: retq +; +; AVXNC-LABEL: pr64460_3: +; AVXNC: # %bb.0: +; AVXNC-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVXNC-NEXT: vpslld $16, %ymm1, %ymm2 +; AVXNC-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVXNC-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVXNC-NEXT: vpslld $16, %ymm0, %ymm1 +; AVXNC-NEXT: vmovdqa %ymm2, %ymm0 +; AVXNC-NEXT: retq + %b = fpext <16 x bfloat> %a to <16 x float> + ret <16 x float> %b +} + +define <8 x double> @pr64460_4(<8 x bfloat> %a) { +; SSE2-LABEL: pr64460_4: +; SSE2: # %bb.0: +; SSE2-NEXT: movq %xmm0, %rsi +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: movq %xmm0, %rdx +; SSE2-NEXT: movq %rdx, %rax +; SSE2-NEXT: shrq $32, %rax +; 
SSE2-NEXT: movq %rdx, %rcx +; SSE2-NEXT: shrq $48, %rcx +; SSE2-NEXT: movq %rsi, %rdi +; SSE2-NEXT: shrq $32, %rdi +; SSE2-NEXT: movq %rsi, %r8 +; SSE2-NEXT: shrq $48, %r8 +; SSE2-NEXT: movl %esi, %r9d +; SSE2-NEXT: andl $-65536, %r9d # imm = 0xFFFF0000 +; SSE2-NEXT: movd %r9d, %xmm0 +; SSE2-NEXT: cvtss2sd %xmm0, %xmm1 +; SSE2-NEXT: shll $16, %esi +; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: cvtss2sd %xmm0, %xmm0 +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: shll $16, %r8d +; SSE2-NEXT: movd %r8d, %xmm1 +; SSE2-NEXT: cvtss2sd %xmm1, %xmm2 +; SSE2-NEXT: shll $16, %edi +; SSE2-NEXT: movd %edi, %xmm1 +; SSE2-NEXT: cvtss2sd %xmm1, %xmm1 +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: movl %edx, %esi +; SSE2-NEXT: andl $-65536, %esi # imm = 0xFFFF0000 +; SSE2-NEXT: movd %esi, %xmm2 +; SSE2-NEXT: cvtss2sd %xmm2, %xmm3 +; SSE2-NEXT: shll $16, %edx +; SSE2-NEXT: movd %edx, %xmm2 +; SSE2-NEXT: cvtss2sd %xmm2, %xmm2 +; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE2-NEXT: shll $16, %ecx +; SSE2-NEXT: movd %ecx, %xmm3 +; SSE2-NEXT: cvtss2sd %xmm3, %xmm4 +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: cvtss2sd %xmm3, %xmm3 +; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSE2-NEXT: retq +; +; F16-LABEL: pr64460_4: +; F16: # %bb.0: +; F16-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; F16-NEXT: vpslld $16, %ymm0, %ymm0 +; F16-NEXT: vcvtps2pd %ymm0, %zmm0 +; F16-NEXT: retq +; +; AVXNC-LABEL: pr64460_4: +; AVXNC: # %bb.0: +; AVXNC-NEXT: vpextrw $3, %xmm0, %eax +; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm1 +; AVXNC-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 +; AVXNC-NEXT: vpextrw $2, %xmm0, %eax +; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm2 +; AVXNC-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 +; AVXNC-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVXNC-NEXT: vpextrw $1, %xmm0, %eax 
+; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm2 +; AVXNC-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 +; AVXNC-NEXT: vmovd %xmm0, %eax +; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm3 +; AVXNC-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 +; AVXNC-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVXNC-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2 +; AVXNC-NEXT: vpextrw $7, %xmm0, %eax +; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm1 +; AVXNC-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 +; AVXNC-NEXT: vpextrw $6, %xmm0, %eax +; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm3 +; AVXNC-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 +; AVXNC-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVXNC-NEXT: vpextrw $5, %xmm0, %eax +; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm3 +; AVXNC-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 +; AVXNC-NEXT: vpextrw $4, %xmm0, %eax +; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm0 +; AVXNC-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; AVXNC-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; AVXNC-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVXNC-NEXT: vmovaps %ymm2, %ymm0 +; AVXNC-NEXT: retq + %b = fpext <8 x bfloat> %a to <8 x double> + ret <8 x double> %b +} + +define <4 x bfloat> @fptrunc_v4f32(<4 x float> %a) nounwind { +; SSE2-LABEL: fptrunc_v4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: subq $32, %rsp +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: callq 
__truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movd %xmm0, %ebp +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movd %xmm0, %r14d +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pinsrw $0, %eax, %xmm0 +; SSE2-NEXT: pinsrw $0, %r14d, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: pinsrw $0, %ebp, %xmm0 +; SSE2-NEXT: pinsrw $0, %ebx, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: addq $32, %rsp +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; F16-LABEL: fptrunc_v4f32: +; F16: # %bb.0: +; F16-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; F16-NEXT: vcvtneps2bf16 %ymm0, %xmm0 +; F16-NEXT: vzeroupper +; F16-NEXT: retq +; +; AVXNC-LABEL: fptrunc_v4f32: +; AVXNC: # %bb.0: +; AVXNC-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm0, %xmm0 +; AVXNC-NEXT: vzeroupper +; AVXNC-NEXT: retq + %b = fptrunc <4 x float> %a to <4 x bfloat> + ret <4 x bfloat> %b +} + +define <8 x bfloat> @fptrunc_v8f32(<8 x float> %a) nounwind { +; SSE2-LABEL: fptrunc_v8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: subq $32, %rsp +; SSE2-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, 
%ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %r14d +; SSE2-NEXT: orl %ebx, %r14d +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebp +; SSE2-NEXT: shll $16, %ebp +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %ebx +; SSE2-NEXT: orl %ebp, %ebx +; SSE2-NEXT: shlq $32, %rbx +; SSE2-NEXT: orq %r14, %rbx +; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebp +; SSE2-NEXT: shll $16, %ebp +; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %r14d +; SSE2-NEXT: orl %ebp, %r14d +; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebp +; SSE2-NEXT: shll $16, %ebp +; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %eax +; SSE2-NEXT: orl %ebp, %eax +; SSE2-NEXT: shlq $32, %rax +; SSE2-NEXT: orq %r14, %rax +; SSE2-NEXT: movq %rax, %xmm1 +; SSE2-NEXT: movq %rbx, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: addq $32, %rsp +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; F16-LABEL: fptrunc_v8f32: +; F16: # %bb.0: +; F16-NEXT: vcvtneps2bf16 %ymm0, %xmm0 +; F16-NEXT: vzeroupper +; F16-NEXT: retq 
+; +; AVXNC-LABEL: fptrunc_v8f32: +; AVXNC: # %bb.0: +; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm0, %xmm0 +; AVXNC-NEXT: vzeroupper +; AVXNC-NEXT: retq + %b = fptrunc <8 x float> %a to <8 x bfloat> + ret <8 x bfloat> %b +} + +define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind { +; SSE2-LABEL: fptrunc_v16f32: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: subq $64, %rsp +; SSE2-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %r14d +; SSE2-NEXT: orl %ebx, %r14d +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebp +; SSE2-NEXT: shll $16, %ebp +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %ebx +; SSE2-NEXT: orl %ebp, %ebx +; SSE2-NEXT: shlq $32, %rbx +; SSE2-NEXT: orq %r14, %rbx +; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebp +; SSE2-NEXT: shll $16, %ebp +; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: 
movzwl %ax, %r15d +; SSE2-NEXT: orl %ebp, %r15d +; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebp +; SSE2-NEXT: shll $16, %ebp +; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %r14d +; SSE2-NEXT: orl %ebp, %r14d +; SSE2-NEXT: shlq $32, %r14 +; SSE2-NEXT: orq %r15, %r14 +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebp +; SSE2-NEXT: shll $16, %ebp +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %r12d +; SSE2-NEXT: orl %ebp, %r12d +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebp +; SSE2-NEXT: shll $16, %ebp +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %r15d +; SSE2-NEXT: orl %ebp, %r15d +; SSE2-NEXT: shlq $32, %r15 +; SSE2-NEXT: orq %r12, %r15 +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebp +; SSE2-NEXT: shll $16, %ebp +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %r12d +; SSE2-NEXT: orl %ebp, %r12d +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: shufps 
{{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebp +; SSE2-NEXT: shll $16, %ebp +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %eax +; SSE2-NEXT: orl %ebp, %eax +; SSE2-NEXT: shlq $32, %rax +; SSE2-NEXT: orq %r12, %rax +; SSE2-NEXT: movq %rax, %xmm1 +; SSE2-NEXT: movq %r15, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: movq %r14, %xmm2 +; SSE2-NEXT: movq %rbx, %xmm1 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: addq $64, %rsp +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; F16-LABEL: fptrunc_v16f32: +; F16: # %bb.0: +; F16-NEXT: vcvtneps2bf16 %zmm0, %ymm0 +; F16-NEXT: retq +; +; AVXNC-LABEL: fptrunc_v16f32: +; AVXNC: # %bb.0: +; AVXNC-NEXT: pushq %rbp +; AVXNC-NEXT: movq %rsp, %rbp +; AVXNC-NEXT: andq $-32, %rsp +; AVXNC-NEXT: subq $64, %rsp +; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm1, %xmm1 +; AVXNC-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) +; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm0, %xmm0 +; AVXNC-NEXT: vmovaps %xmm0, (%rsp) +; AVXNC-NEXT: vmovaps (%rsp), %ymm0 +; AVXNC-NEXT: movq %rbp, %rsp +; AVXNC-NEXT: popq %rbp +; AVXNC-NEXT: retq + %b = fptrunc <16 x float> %a to <16 x bfloat> + ret <16 x bfloat> %b +} + +define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind { +; SSE2-LABEL: fptrunc_v8f64: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: subq $64, %rsp +; SSE2-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpckhqdq {{.*#+}} 
xmm0 = xmm0[1,1] +; SSE2-NEXT: callq __truncdfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: callq __truncdfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %r14d +; SSE2-NEXT: orl %ebx, %r14d +; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: callq __truncdfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebp +; SSE2-NEXT: shll $16, %ebp +; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: callq __truncdfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %ebx +; SSE2-NEXT: orl %ebp, %ebx +; SSE2-NEXT: shlq $32, %rbx +; SSE2-NEXT: orq %r14, %rbx +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: callq __truncdfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebp +; SSE2-NEXT: shll $16, %ebp +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: callq __truncdfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %r14d +; SSE2-NEXT: orl %ebp, %r14d +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: callq __truncdfbf2@PLT +; SSE2-NEXT: movd %xmm0, %ebp +; SSE2-NEXT: shll $16, %ebp +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: callq __truncdfbf2@PLT +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %eax +; SSE2-NEXT: orl %ebp, %eax +; SSE2-NEXT: shlq $32, %rax +; SSE2-NEXT: orq %r14, %rax +; SSE2-NEXT: movq %rax, %xmm1 +; SSE2-NEXT: movq %rbx, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: addq $64, %rsp +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; F16-LABEL: fptrunc_v8f64: +; F16: # %bb.0: +; F16-NEXT: pushq %rbp +; F16-NEXT: pushq %r15 +; F16-NEXT: pushq 
%r14 +; F16-NEXT: pushq %r13 +; F16-NEXT: pushq %r12 +; F16-NEXT: pushq %rbx +; F16-NEXT: subq $136, %rsp +; F16-NEXT: vmovupd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; F16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; F16-NEXT: vzeroupper +; F16-NEXT: callq __truncdfbf2@PLT +; F16-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; F16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; F16-NEXT: vextractf128 $1, %ymm0, %xmm0 +; F16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; F16-NEXT: vzeroupper +; F16-NEXT: callq __truncdfbf2@PLT +; F16-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; F16-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; F16-NEXT: # xmm0 = mem[1,0] +; F16-NEXT: callq __truncdfbf2@PLT +; F16-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; F16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; F16-NEXT: vextractf32x4 $2, %zmm0, %xmm0 +; F16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; F16-NEXT: vzeroupper +; F16-NEXT: callq __truncdfbf2@PLT +; F16-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; F16-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; F16-NEXT: # xmm0 = mem[1,0] +; F16-NEXT: callq __truncdfbf2@PLT +; F16-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; F16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; F16-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; F16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; F16-NEXT: vzeroupper +; F16-NEXT: callq __truncdfbf2@PLT +; F16-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; F16-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; F16-NEXT: # xmm0 = mem[1,0] +; F16-NEXT: callq __truncdfbf2@PLT +; F16-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; F16-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded 
Reload +; F16-NEXT: # xmm0 = mem[0],zero,zero,zero +; F16-NEXT: vmovd %xmm0, %ebp +; F16-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; F16-NEXT: # xmm0 = mem[0],zero,zero,zero +; F16-NEXT: vmovd %xmm0, %r14d +; F16-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; F16-NEXT: # xmm0 = mem[0],zero,zero,zero +; F16-NEXT: vmovd %xmm0, %r15d +; F16-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; F16-NEXT: # xmm0 = mem[0],zero,zero,zero +; F16-NEXT: vmovd %xmm0, %r12d +; F16-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; F16-NEXT: # xmm0 = mem[0],zero,zero,zero +; F16-NEXT: vmovd %xmm0, %r13d +; F16-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; F16-NEXT: # xmm0 = mem[0],zero,zero,zero +; F16-NEXT: vmovd %xmm0, %ebx +; F16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; F16-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; F16-NEXT: vzeroupper +; F16-NEXT: callq __truncdfbf2@PLT +; F16-NEXT: vmovd %xmm0, %eax +; F16-NEXT: vmovd %eax, %xmm0 +; F16-NEXT: vpinsrw $1, %ebx, %xmm0, %xmm0 +; F16-NEXT: vpinsrw $2, %r13d, %xmm0, %xmm0 +; F16-NEXT: vpinsrw $3, %r12d, %xmm0, %xmm0 +; F16-NEXT: vpinsrw $4, %r15d, %xmm0, %xmm0 +; F16-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0 +; F16-NEXT: vpinsrw $6, %ebp, %xmm0, %xmm0 +; F16-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; F16-NEXT: addq $136, %rsp +; F16-NEXT: popq %rbx +; F16-NEXT: popq %r12 +; F16-NEXT: popq %r13 +; F16-NEXT: popq %r14 +; F16-NEXT: popq %r15 +; F16-NEXT: popq %rbp +; F16-NEXT: retq +; +; AVXNC-LABEL: fptrunc_v8f64: +; AVXNC: # %bb.0: +; AVXNC-NEXT: pushq %rbp +; AVXNC-NEXT: pushq %r15 +; AVXNC-NEXT: pushq %r14 +; AVXNC-NEXT: pushq %r13 +; AVXNC-NEXT: pushq %r12 +; AVXNC-NEXT: pushq %rbx +; AVXNC-NEXT: subq $120, %rsp +; AVXNC-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVXNC-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill +; AVXNC-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVXNC-NEXT: vzeroupper +; AVXNC-NEXT: callq __truncdfbf2@PLT +; AVXNC-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVXNC-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVXNC-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVXNC-NEXT: vzeroupper +; AVXNC-NEXT: callq __truncdfbf2@PLT +; AVXNC-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVXNC-NEXT: # xmm0 = mem[1,0] +; AVXNC-NEXT: callq __truncdfbf2@PLT +; AVXNC-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVXNC-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVXNC-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVXNC-NEXT: vzeroupper +; AVXNC-NEXT: callq __truncdfbf2@PLT +; AVXNC-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVXNC-NEXT: # xmm0 = mem[1,0] +; AVXNC-NEXT: callq __truncdfbf2@PLT +; AVXNC-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVXNC-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVXNC-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVXNC-NEXT: vzeroupper +; AVXNC-NEXT: callq __truncdfbf2@PLT +; AVXNC-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVXNC-NEXT: # xmm0 = mem[1,0] +; AVXNC-NEXT: callq __truncdfbf2@PLT +; AVXNC-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVXNC-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; AVXNC-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVXNC-NEXT: vmovd %xmm0, %ebp +; AVXNC-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload 
+; AVXNC-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVXNC-NEXT: vmovd %xmm0, %r14d +; AVXNC-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; AVXNC-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVXNC-NEXT: vmovd %xmm0, %r15d +; AVXNC-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; AVXNC-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVXNC-NEXT: vmovd %xmm0, %r12d +; AVXNC-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; AVXNC-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVXNC-NEXT: vmovd %xmm0, %r13d +; AVXNC-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; AVXNC-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVXNC-NEXT: vmovd %xmm0, %ebx +; AVXNC-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVXNC-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVXNC-NEXT: vzeroupper +; AVXNC-NEXT: callq __truncdfbf2@PLT +; AVXNC-NEXT: vmovd %xmm0, %eax +; AVXNC-NEXT: vmovd %eax, %xmm0 +; AVXNC-NEXT: vpinsrw $1, %ebx, %xmm0, %xmm0 +; AVXNC-NEXT: vpinsrw $2, %r13d, %xmm0, %xmm0 +; AVXNC-NEXT: vpinsrw $3, %r12d, %xmm0, %xmm0 +; AVXNC-NEXT: vpinsrw $4, %r15d, %xmm0, %xmm0 +; AVXNC-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0 +; AVXNC-NEXT: vpinsrw $6, %ebp, %xmm0, %xmm0 +; AVXNC-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVXNC-NEXT: addq $120, %rsp +; AVXNC-NEXT: popq %rbx +; AVXNC-NEXT: popq %r12 +; AVXNC-NEXT: popq %r13 +; AVXNC-NEXT: popq %r14 +; AVXNC-NEXT: popq %r15 +; AVXNC-NEXT: popq %rbp +; AVXNC-NEXT: retq + %b = fptrunc <8 x double> %a to <8 x bfloat> + ret <8 x bfloat> %b +} diff --git a/llvm/test/CodeGen/X86/machine-licm-vs-wineh.mir b/llvm/test/CodeGen/X86/machine-licm-vs-wineh.mir new file mode 100644 index 0000000000000..4bfd749fb7723 --- /dev/null +++ b/llvm/test/CodeGen/X86/machine-licm-vs-wineh.mir @@ -0,0 +1,141 @@ +# RUN: llc -o - %s -mtriple=x86_64-pc-windows-msvc -run-pass=machinelicm | FileCheck %s +# +# This test checks that MachineLICM doesn't 
hoist loads out of funclets. +# Manually modified from the IR of the following C++ function by running +# llc -stop-after=machine-cp. +# +# void may_throw(); +# void use(int); +# +# void test(int n, int arg) +# { +# for (int i = 0 ; i < n ; i++) +# try { +# may_throw(); +# } +# catch (...) { +# // Two uses to get 'arg' allocated to a register +# use(arg); +# use(arg); +# } +# } + +--- | + target triple = "x86_64-pc-windows-msvc" + + define void @test(i32 %n, i32 %arg) personality ptr @__CxxFrameHandler3 { + entry: + %cmp3 = icmp sgt i32 %n, 0 + br i1 %cmp3, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + br label %for.body + + for.cond.cleanup: ; preds = %for.inc, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.inc + %lsr.iv = phi i32 [ %n, %for.body.preheader ], [ %lsr.iv.next, %for.inc ] + invoke void @may_throw() + to label %for.inc unwind label %catch.dispatch + + catch.dispatch: ; preds = %for.body + %0 = catchswitch within none [label %catch] unwind to caller + + catch: ; preds = %catch.dispatch + %1 = catchpad within %0 [ptr null, i32 64, ptr null] + call void @use(i32 %arg) [ "funclet"(token %1) ] + call void @use(i32 %arg) [ "funclet"(token %1) ] + catchret from %1 to label %for.inc + + for.inc: ; preds = %catch, %for.body + %lsr.iv.next = add i32 %lsr.iv, -1 + %exitcond.not = icmp eq i32 %lsr.iv.next, 0 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + } + + declare i32 @__CxxFrameHandler3(...) + + declare void @may_throw() + + declare void @use(i32) + +... 
+--- +name: test +alignment: 16 +tracksRegLiveness: true +hasEHCatchret: true +hasEHScopes: true +hasEHFunclets: true +debugInstrRef: true +tracksDebugUserValues: true +liveins: + - { reg: '$ecx' } + - { reg: '$edx' } +frameInfo: + maxAlignment: 8 + hasCalls: true + hasOpaqueSPAdjustment: true +stack: + - { id: 0, type: spill-slot, size: 4, alignment: 4 } + - { id: 1, type: spill-slot, size: 4, alignment: 4 } +machineFunctionInfo: {} +body: | + bb.0.entry: + successors: %bb.1, %bb.2 + liveins: $ecx, $edx + + MOV32mr %stack.1, 1, $noreg, 0, $noreg, $edx :: (store (s32) into %stack.1) + TEST32rr renamable $ecx, renamable $ecx, implicit-def $eflags + JCC_1 %bb.2, 14, implicit killed $eflags + + bb.1: + liveins: $ecx + + JMP_1 %bb.3 + + bb.2.for.cond.cleanup: + RET 0 + + bb.3.for.body: + successors: %bb.5, %bb.4 + liveins: $ecx + + EH_LABEL + MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $ecx :: (store (s32) into %stack.0) + ADJCALLSTACKDOWN64 32, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + CALL64pcrel32 @may_throw, csr_win64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp + ADJCALLSTACKUP64 32, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + EH_LABEL + JMP_1 %bb.5 + + bb.4.catch (landing-pad, ehfunclet-entry): + successors: %bb.5 + + ADJCALLSTACKDOWN64 32, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + renamable $esi = MOV32rm %stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %stack.1) + $ecx = COPY renamable $esi + CALL64pcrel32 @use, csr_win64, implicit $rsp, implicit $ssp, implicit $ecx, implicit-def $rsp, implicit-def $ssp + ADJCALLSTACKUP64 32, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ADJCALLSTACKDOWN64 32, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, 
implicit-def dead $ssp, implicit $rsp, implicit $ssp + $ecx = COPY killed renamable $esi + CALL64pcrel32 @use, csr_win64, implicit $rsp, implicit $ssp, implicit $ecx, implicit-def $rsp, implicit-def $ssp + ADJCALLSTACKUP64 32, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + CATCHRET %bb.5, %bb.0 + + bb.5.for.inc: + successors: %bb.2, %bb.3 + + renamable $ecx = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0) + renamable $ecx = DEC32r killed renamable $ecx, implicit-def $eflags + JCC_1 %bb.2, 4, implicit killed $eflags + JMP_1 %bb.3 + +... +# +# CHECK: bb.4.catch +# CHECK: ADJCALLSTACKDOWN64 +# CHECK-NEXT: renamable [[REG:\$[a-z0-9]+]] = MOV32rm %stack.1 +# CHECK-NEXT: $ecx = COPY renamable [[REG]] +# CHECK-NEXT: CALL64pcrel32 @use diff --git a/llvm/test/CodeGen/X86/x86-prefer-no-gather-no-scatter.ll b/llvm/test/CodeGen/X86/x86-prefer-no-gather-no-scatter.ll index e3f3622f146d9..33250b3495a00 100644 --- a/llvm/test/CodeGen/X86/x86-prefer-no-gather-no-scatter.ll +++ b/llvm/test/CodeGen/X86/x86-prefer-no-gather-no-scatter.ll @@ -1,6 +1,6 @@ ; Check that if option prefer-no-gather/scatter can disable gather/scatter instructions. 
-; RUN: llc -mattr=+avx2,+fast-gather %s -o - | FileCheck %s --check-prefixes=GATHER -; RUN: llc -mattr=+avx2,+fast-gather,+prefer-no-gather %s -o - | FileCheck %s --check-prefixes=NO-GATHER +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2,+fast-gather %s -o - | FileCheck %s --check-prefixes=GATHER +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2,+fast-gather,+prefer-no-gather %s -o - | FileCheck %s --check-prefixes=NO-GATHER ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+avx512dq < %s | FileCheck %s --check-prefix=SCATTER ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+avx512dq,+prefer-no-gather < %s | FileCheck %s --check-prefix=SCATTER-NO-GATHER ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+avx512dq,+prefer-no-scatter < %s | FileCheck %s --check-prefix=GATHER-NO-SCATTER diff --git a/llvm/test/Transforms/ConstraintElimination/large-constant-ints.ll b/llvm/test/Transforms/ConstraintElimination/large-constant-ints.ll index fa8a4a60eac14..f08068420406d 100644 --- a/llvm/test/Transforms/ConstraintElimination/large-constant-ints.ll +++ b/llvm/test/Transforms/ConstraintElimination/large-constant-ints.ll @@ -282,6 +282,29 @@ else: ret i1 false } +define i1 @mul_nsw_decomp(i128 %x) { +; CHECK-LABEL: @mul_nsw_decomp( +; CHECK-NEXT: [[VAL:%.*]] = mul nsw i128 [[X:%.*]], 9223372036854775808 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i128 [[X]], [[VAL]] +; CHECK-NEXT: br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i128 [[X]], 0 +; CHECK-NEXT: ret i1 [[CMP2]] +; CHECK: else: +; CHECK-NEXT: ret i1 false +; + %val = mul nsw i128 %x, 9223372036854775808 + %cmp = icmp sgt i128 %x, %val + br i1 %cmp, label %then, label %else + +then: + %cmp2 = icmp sgt i128 %x, 0 + ret i1 %cmp2 + +else: + ret i1 false +} + define i1 @add_nuw_decomp_recursive() { ; CHECK-LABEL: @add_nuw_decomp_recursive( ; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i64 -9223372036854775808, 10 
diff --git a/llvm/test/Transforms/JumpThreading/invalidate-lvi.ll b/llvm/test/Transforms/JumpThreading/invalidate-lvi.ll new file mode 100644 index 0000000000000..27191d6f54c2d --- /dev/null +++ b/llvm/test/Transforms/JumpThreading/invalidate-lvi.ll @@ -0,0 +1,59 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt -S -passes=jump-threading < %s | FileCheck %s + +declare void @set_value(ptr) + +declare void @bar() + +define void @foo(i1 %0) { +; CHECK-LABEL: define void @foo( +; CHECK-SAME: i1 [[TMP0:%.*]]) { +; CHECK-NEXT: start: +; CHECK-NEXT: [[V:%.*]] = alloca i64, align 8 +; CHECK-NEXT: call void @set_value(ptr [[V]]) +; CHECK-NEXT: [[L1:%.*]] = load i64, ptr [[V]], align 8 +; CHECK-NEXT: br i1 [[TMP0]], label [[BB0:%.*]], label [[BB2:%.*]] +; CHECK: bb0: +; CHECK-NEXT: [[C1:%.*]] = icmp eq i64 [[L1]], 0 +; CHECK-NEXT: br i1 [[C1]], label [[BB2_THREAD:%.*]], label [[BB2]] +; CHECK: bb2.thread: +; CHECK-NEXT: store i64 0, ptr [[V]], align 8 +; CHECK-NEXT: br label [[BB4:%.*]] +; CHECK: bb2: +; CHECK-NEXT: [[L2:%.*]] = phi i64 [ [[L1]], [[BB0]] ], [ [[L1]], [[START:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[L2]], 2 +; CHECK-NEXT: br i1 [[TMP1]], label [[BB3:%.*]], label [[BB4]] +; CHECK: bb3: +; CHECK-NEXT: call void @bar() +; CHECK-NEXT: ret void +; CHECK: bb4: +; CHECK-NEXT: ret void +; +start: + %v = alloca i64, align 8 + call void @set_value(ptr %v) + %l1 = load i64, ptr %v, align 8, !range !0 + br i1 %0, label %bb0, label %bb2 + +bb0: ; preds = %start + %c1 = icmp eq i64 %l1, 0 + br i1 %c1, label %bb1, label %bb2 + +bb1: ; preds = %bb0 + store i64 0, ptr %v, align 8 + br label %bb2 + +bb2: ; preds = %bb1, %bb0, %start + %l2 = load i64, ptr %v, align 8 + %1 = icmp eq i64 %l2, 2 + br i1 %1, label %bb3, label %bb4 + +bb3: ; preds = %bb2 + call void @bar() + ret void + +bb4: ; preds = %bb2 + ret void +} + +!0 = !{i64 0, i64 2} diff --git a/llvm/test/Transforms/SROA/scalable-vector-struct.ll 
b/llvm/test/Transforms/SROA/scalable-vector-struct.ll index 92cd44d2b5ac3..1af4fbbd9254b 100644 --- a/llvm/test/Transforms/SROA/scalable-vector-struct.ll +++ b/llvm/test/Transforms/SROA/scalable-vector-struct.ll @@ -20,3 +20,34 @@ define %struct.test @alloca( %x, %y) { %val = load %struct.test, %struct.test* %addr, align 4 ret %struct.test %val } + + +define { , } @return_tuple( %v_tuple.coerce0, %v_tuple.coerce1) { +; CHECK-LABEL: @return_tuple( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[V_TUPLE_COERCE0:%.*]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[V_TUPLE_COERCE1:%.*]], 1 +; CHECK-NEXT: [[COERCE_EXTRACT0:%.*]] = extractvalue { , } [[TMP1]], 0 +; CHECK-NEXT: [[COERCE_EXTRACT1:%.*]] = extractvalue { , } [[TMP1]], 1 +; CHECK-NEXT: [[CALL:%.*]] = call { , } @foo( [[COERCE_EXTRACT0]], [[COERCE_EXTRACT1]]) +; CHECK-NEXT: ret { , } [[CALL]] +; +entry: + %v_tuple = alloca { , }, align 4 + %v_tuple.addr = alloca { , }, align 4 + %coerce = alloca { , }, align 4 + %0 = insertvalue { , } poison, %v_tuple.coerce0, 0 + %1 = insertvalue { , } %0, %v_tuple.coerce1, 1 + store { , } %1, ptr %v_tuple, align 4 + %v_tuple1 = load { , }, ptr %v_tuple, align 4 + store { , } %v_tuple1, ptr %v_tuple.addr, align 4 + %2 = load { , }, ptr %v_tuple.addr, align 4 + store { , } %2, ptr %coerce, align 4 + %coerce.tuple = load { , }, ptr %coerce, align 4 + %coerce.extract0 = extractvalue { , } %coerce.tuple, 0 + %coerce.extract1 = extractvalue { , } %coerce.tuple, 1 + %call = call { , } @foo( %coerce.extract0, %coerce.extract1) + ret { , } %call +} + +declare { , } @foo(, ) diff --git a/llvm/test/tools/llvm-rc/windres-preproc.test b/llvm/test/tools/llvm-rc/windres-preproc.test index 888be03f7d9e4..e55195b3a4d28 100644 --- a/llvm/test/tools/llvm-rc/windres-preproc.test +++ b/llvm/test/tools/llvm-rc/windres-preproc.test @@ -4,6 +4,7 @@ ; REQUIRES: shell ; RUN: llvm-windres -### --include-dir %p/incdir1 --include %p/incdir2 "-DFOO1=\\\"foo 
bar\\\"" -UFOO2 -D FOO3 --preprocessor-arg "-DFOO4=\\\"baz baz\\\"" -DFOO5=\"bar\" %p/Inputs/empty.rc %t.res | FileCheck %s --check-prefix=CHECK1 +; RUN: llvm-windres -### --include-dir %p/incdir1 --include %p/incdir2 "-DFOO1=\"foo bar\"" -UFOO2 -D FOO3 --preprocessor-arg "-DFOO4=\"baz baz\"" "-DFOO5=bar" %p/Inputs/empty.rc %t.res --use-temp-file | FileCheck %s --check-prefix=CHECK1 ; CHECK1: {{^}} "clang" "--driver-mode=gcc" "-target" "{{.*}}-{{.*}}{{mingw32|windows-gnu}}" "-E" "-xc" "-DRC_INVOKED" "{{.*}}empty.rc" "-o" "{{.*}}preproc-{{.*}}.rc" "-I" "{{.*}}incdir1" "-I" "{{.*}}incdir2" "-D" "FOO1=\"foo bar\"" "-U" "FOO2" "-D" "FOO3" "-DFOO4=\"baz baz\"" "-D" "FOO5=bar"{{$}} ; RUN: llvm-windres -### --preprocessor "i686-w64-mingw32-gcc -E -DFOO=\\\"foo\\ bar\\\"" %p/Inputs/empty.rc %t.res | FileCheck %s --check-prefix=CHECK2 ; CHECK2: {{^}} "i686-w64-mingw32-gcc" "-E" "-DFOO=\"foo bar\"" "{{.*}}empty.rc" "-o" "{{.*}}preproc-{{.*}}.rc"{{$}} diff --git a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp index e690089fec565..b3d40800aef63 100644 --- a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp @@ -1083,11 +1083,11 @@ ExegesisX86Target::generateExitSyscall(unsigned ExitCode) const { #define MAP_FIXED_NOREPLACE MAP_FIXED #endif -// 32 bit ARM doesn't have mmap and uses mmap2 instead. The only difference -// between the two syscalls is that mmap2's offset parameter is in terms 4096 -// byte offsets rather than individual bytes, so for our purposes they are -// effectively the same as all ofsets here are set to 0. -#ifdef __arm__ +// Some 32-bit architectures don't have mmap and define mmap2 instead. The only +// difference between the two syscalls is that mmap2's offset parameter is in +// terms 4096 byte offsets rather than individual bytes, so for our purposes +// they are effectively the same as all ofsets here are set to 0. 
+#if defined(SYS_mmap2) && !defined(SYS_mmap) #define SYS_mmap SYS_mmap2 #endif diff --git a/llvm/tools/llvm-rc/WindresOpts.td b/llvm/tools/llvm-rc/WindresOpts.td index 3c75c85ece0f6..42a56dbfda4cd 100644 --- a/llvm/tools/llvm-rc/WindresOpts.td +++ b/llvm/tools/llvm-rc/WindresOpts.td @@ -48,6 +48,10 @@ defm codepage : LongShort<"c", "codepage", "Default codepage to use">; defm language : LongShort<"l", "language", "Default language to use (0x0-0xffff)">; +def use_temp_file: Flag<["--"], "use-temp-file">, + HelpText<"Mimic GNU windres preprocessor option handling " + "(don't unescape preprocessor options)">; + defm verbose : F<"v", "verbose", "Enable verbose output">; defm version : F<"V", "version", "Display version">; @@ -57,6 +61,3 @@ defm help : F<"h", "help", "Display this message and exit">; def _HASH_HASH_HASH : Flag<["-"], "###">; def no_preprocess : Flag<["--"], "no-preprocess">; - -// Unimplemented options for compatibility -def use_temp_file: Flag<["--"], "use-temp-file">; diff --git a/llvm/tools/llvm-rc/llvm-rc.cpp b/llvm/tools/llvm-rc/llvm-rc.cpp index 4a77f4bd88cce..0caa8117cb70b 100644 --- a/llvm/tools/llvm-rc/llvm-rc.cpp +++ b/llvm/tools/llvm-rc/llvm-rc.cpp @@ -142,20 +142,24 @@ ErrorOr findClang(const char *Argv0, StringRef Triple) { if (MainExecPath.empty()) MainExecPath = Argv0; - StringRef Parent = llvm::sys::path::parent_path(MainExecPath); ErrorOr Path = std::error_code(); std::string TargetClang = (Triple + "-clang").str(); std::string VersionedClang = ("clang-" + Twine(LLVM_VERSION_MAJOR)).str(); - if (!Parent.empty()) { - // First look for the tool with all potential names in the specific - // directory of Argv0, if known - for (const auto *Name : - {TargetClang.c_str(), VersionedClang.c_str(), "clang", "clang-cl"}) { + for (const auto *Name : + {TargetClang.c_str(), VersionedClang.c_str(), "clang", "clang-cl"}) { + for (const StringRef Parent : + {llvm::sys::path::parent_path(MainExecPath), + llvm::sys::path::parent_path(Argv0)}) { + // 
Look for various versions of "clang" first in the MainExecPath parent + // directory and then in the argv[0] parent directory. + // On Windows (but not Unix) argv[0] is overwritten with the eqiuvalent + // of MainExecPath by InitLLVM. Path = sys::findProgramByName(Name, Parent); if (Path) return Path; } } + // If no parent directory known, or not found there, look everywhere in PATH for (const auto *Name : {"clang", "clang-cl"}) { Path = sys::findProgramByName(Name); @@ -469,7 +473,14 @@ RcOptions parseWindresOptions(ArrayRef ArgsArr, // done this double escaping) probably is confined to cases like these // quoted string defines, and those happen to work the same across unix // and windows. - std::string Unescaped = unescape(Arg->getValue()); + // + // If GNU windres is executed with --use-temp-file, it doesn't use + // popen() to invoke the preprocessor, but uses another function which + // actually preserves tricky characters better. To mimic this behaviour, + // don't unescape arguments here. + std::string Value = Arg->getValue(); + if (!InputArgs.hasArg(WINDRES_use_temp_file)) + Value = unescape(Value); switch (Arg->getOption().getID()) { case WINDRES_include_dir: // Technically, these are handled the same way as e.g. defines, but @@ -483,17 +494,19 @@ RcOptions parseWindresOptions(ArrayRef ArgsArr, break; case WINDRES_define: Opts.PreprocessArgs.push_back("-D"); - Opts.PreprocessArgs.push_back(Unescaped); + Opts.PreprocessArgs.push_back(Value); break; case WINDRES_undef: Opts.PreprocessArgs.push_back("-U"); - Opts.PreprocessArgs.push_back(Unescaped); + Opts.PreprocessArgs.push_back(Value); break; case WINDRES_preprocessor_arg: - Opts.PreprocessArgs.push_back(Unescaped); + Opts.PreprocessArgs.push_back(Value); break; } } + // TODO: If --use-temp-file is set, we shouldn't be unescaping + // the --preprocessor argument either, only splitting it. 
if (InputArgs.hasArg(WINDRES_preprocessor)) Opts.PreprocessCmd = unescapeSplit(InputArgs.getLastArgValue(WINDRES_preprocessor)); diff --git a/llvm/unittests/tools/llvm-exegesis/X86/TargetTest.cpp b/llvm/unittests/tools/llvm-exegesis/X86/TargetTest.cpp index aa5d525f24eb7..c001c693cc146 100644 --- a/llvm/unittests/tools/llvm-exegesis/X86/TargetTest.cpp +++ b/llvm/unittests/tools/llvm-exegesis/X86/TargetTest.cpp @@ -635,11 +635,11 @@ TEST_F(X86Core2TargetTest, GenerateExitSyscallTest) { #define MAP_FIXED_NOREPLACE MAP_FIXED #endif -// 32 bit ARM doesn't have mmap and uses mmap2 instead. The only difference -// between the two syscalls is that mmap2's offset parameter is in terms 4096 -// byte offsets rather than individual bytes, so for our purposes they are -// effectively the same as all ofsets here are set to 0. -#ifdef __arm__ +// Some 32-bit architectures don't have mmap and define mmap2 instead. The only +// difference between the two syscalls is that mmap2's offset parameter is in +// terms 4096 byte offsets rather than individual bytes, so for our purposes +// they are effectively the same as all ofsets here are set to 0. 
+#if defined(SYS_mmap2) && !defined(SYS_mmap) #define SYS_mmap SYS_mmap2 #endif diff --git a/mlir/docs/PatternRewriter.md b/mlir/docs/PatternRewriter.md index 8428d4ba991ef..8fe5ef35a7603 100644 --- a/mlir/docs/PatternRewriter.md +++ b/mlir/docs/PatternRewriter.md @@ -383,7 +383,7 @@ Example output is shown below: ``` //===-------------------------------------------===// Processing operation : 'cf.cond_br'(0x60f000001120) { - "cf.cond_br"(%arg0)[^bb2, ^bb2] {operand_segment_sizes = array} : (i1) -> () + "cf.cond_br"(%arg0)[^bb2, ^bb2] {operandSegmentSizes = array} : (i1) -> () * Pattern SimplifyConstCondBranchPred : 'cf.cond_br -> ()' { } -> failure : pattern failed to match diff --git a/mlir/include/mlir/Dialect/IRDL/IR/IRDLOps.td b/mlir/include/mlir/Dialect/IRDL/IR/IRDLOps.td index dfa97c865118f..9f15ca767abf9 100644 --- a/mlir/include/mlir/Dialect/IRDL/IR/IRDLOps.td +++ b/mlir/include/mlir/Dialect/IRDL/IR/IRDLOps.td @@ -223,6 +223,18 @@ def IRDL_OperandsOp : IRDL_Op<"operands", [HasParent<"OperationOp">]> { The `mul` operation will expect two operands of type `cmath.complex`, that have the same type, and return a result of the same type. + + The operands can also be marked as variadic or optional: + ```mlir + irdl.operands(%0, single %1, optional %2, variadic %3) + ``` + + Here, %0 and %1 are required single operands, %2 is an optional operand, + and %3 is a variadic operand. + + When more than one operand is marked as optional or variadic, the operation + will expect a 'operandSegmentSizes' attribute that defines the number of + operands in each segment. }]; let arguments = (ins Variadic:$args); @@ -254,6 +266,18 @@ def IRDL_ResultsOp : IRDL_Op<"results", [HasParent<"OperationOp">]> { The operation will expect one operand of the `cmath.complex` type, and two results that have the underlying type of the `cmath.complex`. 
+ + The results can also be marked as variadic or optional: + ```mlir + irdl.results(%0, single %1, optional %2, variadic %3) + ``` + + Here, %0 and %1 are required single results, %2 is an optional result, + and %3 is a variadic result. + + When more than one result is marked as optional or variadic, the operation + will expect a 'resultSegmentSizes' attribute that defines the number of + results in each segment. }]; let arguments = (ins Variadic:$args); diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td index 1efd2b6b63dd9..4567b3f1902d7 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td @@ -874,23 +874,6 @@ def LinalgStructuredInterface : OpInterface<"LinalgOp"> { return cast(*this->getOperation()) .hasTensorSemantics(); } - - //========================================================================// - // Helper functions to mutate the `operand_segment_sizes` attribute. - // These are useful when cloning and changing operand types. - //========================================================================// - void setNumInputs(unsigned num) { setOperandSegmentAt(0, num); } - void setNumOutputBuffers(unsigned num) { setOperandSegmentAt(1, num); } - - private: - void setOperandSegmentAt(unsigned idx, unsigned val) { - auto attr = ::llvm::cast( - (*this)->getAttr("operand_segment_sizes")); - unsigned i = 0; - auto newAttr = attr.mapValues(IntegerType::get(getContext(), 32), - [&](const APInt &v) { return (i++ == idx) ? 
APInt(32, val) : v; }); - getOperation()->setAttr("operand_segment_sizes", newAttr); - } }]; let verify = [{ return detail::verifyStructuredOpInterface($_op); }]; diff --git a/mlir/include/mlir/IR/OpBase.td b/mlir/include/mlir/IR/OpBase.td index 274a531f4061e..f25106b1593a3 100644 --- a/mlir/include/mlir/IR/OpBase.td +++ b/mlir/include/mlir/IR/OpBase.td @@ -2178,7 +2178,7 @@ def SameVariadicOperandSize : GenInternalOpTrait<"SameVariadicOperandSize">; // to have the same array size. def SameVariadicResultSize : GenInternalOpTrait<"SameVariadicResultSize">; -// Uses an attribute named `operand_segment_sizes` to specify how many actual +// Uses an attribute named `operandSegmentSizes` to specify how many actual // operand each ODS-declared operand (variadic or not) corresponds to. // This trait is used for ops that have multiple variadic operands but do // not know statically their size relationship. The attribute must be a 1D @@ -2188,7 +2188,7 @@ def SameVariadicResultSize : GenInternalOpTrait<"SameVariadicResultSize">; def AttrSizedOperandSegments : NativeOpTrait<"AttrSizedOperandSegments">, StructuralOpTrait; // Similar to AttrSizedOperandSegments, but used for results. The attribute -// should be named as `result_segment_sizes`. +// should be named as `resultSegmentSizes`. def AttrSizedResultSegments : NativeOpTrait<"AttrSizedResultSegments">, StructuralOpTrait; diff --git a/mlir/include/mlir/IR/OpDefinition.h b/mlir/include/mlir/IR/OpDefinition.h index d42bffaf32b03..afbd0395b466a 100644 --- a/mlir/include/mlir/IR/OpDefinition.h +++ b/mlir/include/mlir/IR/OpDefinition.h @@ -1331,7 +1331,7 @@ struct HasParent { /// relationship is not always known statically. For such cases, we need /// a per-op-instance specification to divide the operands into logical groups /// or segments. This can be modeled by attributes. The attribute will be named -/// as `operand_segment_sizes`. +/// as `operandSegmentSizes`. 
/// /// This trait verifies the attribute for specifying operand segments has /// the correct type (1D vector) and values (non-negative), etc. @@ -1339,9 +1339,7 @@ template class AttrSizedOperandSegments : public TraitBase { public: - static StringRef getOperandSegmentSizeAttr() { - return "operand_segment_sizes"; - } + static StringRef getOperandSegmentSizeAttr() { return "operandSegmentSizes"; } static LogicalResult verifyTrait(Operation *op) { return ::mlir::OpTrait::impl::verifyOperandSizeAttr( @@ -1354,7 +1352,7 @@ template class AttrSizedResultSegments : public TraitBase { public: - static StringRef getResultSegmentSizeAttr() { return "result_segment_sizes"; } + static StringRef getResultSegmentSizeAttr() { return "resultSegmentSizes"; } static LogicalResult verifyTrait(Operation *op) { return ::mlir::OpTrait::impl::verifyResultSizeAttr( diff --git a/mlir/include/mlir/IR/OpImplementation.h b/mlir/include/mlir/IR/OpImplementation.h index 0eeb8bb1ec8da..2131fe313f8c5 100644 --- a/mlir/include/mlir/IR/OpImplementation.h +++ b/mlir/include/mlir/IR/OpImplementation.h @@ -715,18 +715,20 @@ class AsmParser { //===--------------------------------------------------------------------===// /// This class represents a StringSwitch like class that is useful for parsing - /// expected keywords. On construction, it invokes `parseKeyword` and - /// processes each of the provided cases statements until a match is hit. The - /// provided `ResultT` must be assignable from `failure()`. + /// expected keywords. On construction, unless a non-empty keyword is + /// provided, it invokes `parseKeyword` and processes each of the provided + /// cases statements until a match is hit. The provided `ResultT` must be + /// assignable from `failure()`. 
template class KeywordSwitch { public: - KeywordSwitch(AsmParser &parser) + KeywordSwitch(AsmParser &parser, StringRef *keyword = nullptr) : parser(parser), loc(parser.getCurrentLocation()) { - if (failed(parser.parseKeywordOrCompletion(&keyword))) + if (keyword && !keyword->empty()) + this->keyword = *keyword; + else if (failed(parser.parseKeywordOrCompletion(&this->keyword))) result = failure(); } - /// Case that uses the provided value when true. KeywordSwitch &Case(StringLiteral str, ResultT value) { return Case(str, [&](StringRef, SMLoc) { return std::move(value); }); diff --git a/mlir/include/mlir/IR/OperationSupport.h b/mlir/include/mlir/IR/OperationSupport.h index f3a79eb52f8ec..adae3560570dd 100644 --- a/mlir/include/mlir/IR/OperationSupport.h +++ b/mlir/include/mlir/IR/OperationSupport.h @@ -555,7 +555,7 @@ class RegisteredOperationName : public OperationName { StringRef name) final { if constexpr (hasProperties) { auto concreteOp = cast(op); - return ConcreteOp::getInherentAttr(concreteOp.getContext(), + return ConcreteOp::getInherentAttr(concreteOp->getContext(), concreteOp.getProperties(), name); } // If the op does not have support for properties, we dispatch back to the @@ -576,7 +576,7 @@ class RegisteredOperationName : public OperationName { void populateInherentAttrs(Operation *op, NamedAttrList &attrs) final { if constexpr (hasProperties) { auto concreteOp = cast(op); - ConcreteOp::populateInherentAttrs(concreteOp.getContext(), + ConcreteOp::populateInherentAttrs(concreteOp->getContext(), concreteOp.getProperties(), attrs); } } diff --git a/mlir/include/mlir/Pass/PassManager.h b/mlir/include/mlir/Pass/PassManager.h index 75fe1524221c1..d5f1ea0fe0350 100644 --- a/mlir/include/mlir/Pass/PassManager.h +++ b/mlir/include/mlir/Pass/PassManager.h @@ -172,6 +172,10 @@ class OpPassManager { /// if a pass manager has already been initialized. 
LogicalResult initialize(MLIRContext *context, unsigned newInitGeneration); + /// Compute a hash of the pipeline, so that we can detect changes (a pass is + /// added...). + llvm::hash_code hash(); + /// A pointer to an internal implementation instance. std::unique_ptr impl; @@ -439,9 +443,11 @@ class PassManager : public OpPassManager { /// generate reproducers. std::unique_ptr crashReproGenerator; - /// A hash key used to detect when reinitialization is necessary. + /// Hash keys used to detect when reinitialization is necessary. llvm::hash_code initializationKey = DenseMapInfo::getTombstoneKey(); + llvm::hash_code pipelineInitializationKey = + DenseMapInfo::getTombstoneKey(); /// Flag that specifies if pass timing is enabled. bool passTiming : 1; diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp index 971d2819ade44..c755dc12a311b 100644 --- a/mlir/lib/Bindings/Python/IRCore.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -1675,28 +1675,28 @@ py::object PyOpView::buildGeneric( } else { attributes = py::dict(); } - if (attributes->contains("result_segment_sizes") || - attributes->contains("operand_segment_sizes")) { - throw py::value_error("Manually setting a 'result_segment_sizes' or " - "'operand_segment_sizes' attribute is unsupported. " + if (attributes->contains("resultSegmentSizes") || + attributes->contains("operandSegmentSizes")) { + throw py::value_error("Manually setting a 'resultSegmentSizes' or " + "'operandSegmentSizes' attribute is unsupported. " "Use Operation.create for such low-level access."); } - // Add result_segment_sizes attribute. + // Add resultSegmentSizes attribute. if (!resultSegmentLengths.empty()) { MlirAttribute segmentLengthAttr = mlirDenseI32ArrayGet(context->get(), resultSegmentLengths.size(), resultSegmentLengths.data()); - (*attributes)["result_segment_sizes"] = + (*attributes)["resultSegmentSizes"] = PyAttribute(context, segmentLengthAttr); } - // Add operand_segment_sizes attribute. 
+ // Add operandSegmentSizes attribute. if (!operandSegmentLengths.empty()) { MlirAttribute segmentLengthAttr = mlirDenseI32ArrayGet(context->get(), operandSegmentLengths.size(), operandSegmentLengths.data()); - (*attributes)["operand_segment_sizes"] = + (*attributes)["operandSegmentSizes"] = PyAttribute(context, segmentLengthAttr); } } diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp index 9dfe07797ff4b..e6154a329aacc 100644 --- a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp +++ b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp @@ -53,7 +53,7 @@ enum class DataArgAction : unsigned char { // Fix up the fact that, when we're migrating from a general bugffer atomic // to a load or to a CAS, the number of openrands, and thus the number of -// entries needed in operand_segment_sizes, needs to change. We use this method +// entries needed in operandSegmentSizes, needs to change. We use this method // because we'd like to preserve unknown attributes on the atomic instead of // discarding them. 
static void patchOperandSegmentSizes(ArrayRef attrs, @@ -61,7 +61,7 @@ static void patchOperandSegmentSizes(ArrayRef attrs, DataArgAction action) { newAttrs.reserve(attrs.size()); for (NamedAttribute attr : attrs) { - if (attr.getName().getValue() != "operand_segment_sizes") { + if (attr.getName().getValue() != "operandSegmentSizes") { newAttrs.push_back(attr); continue; } diff --git a/mlir/lib/Dialect/Async/IR/Async.cpp b/mlir/lib/Dialect/Async/IR/Async.cpp index 7d018bf8f3a3d..abe6670c7f855 100644 --- a/mlir/lib/Dialect/Async/IR/Async.cpp +++ b/mlir/lib/Dialect/Async/IR/Async.cpp @@ -61,7 +61,7 @@ YieldOp::getMutableSuccessorOperands(std::optional index) { /// ExecuteOp //===----------------------------------------------------------------------===// -constexpr char kOperandSegmentSizesAttr[] = "operand_segment_sizes"; +constexpr char kOperandSegmentSizesAttr[] = "operandSegmentSizes"; OperandRange ExecuteOp::getSuccessorEntryOperands(std::optional index) { @@ -100,7 +100,7 @@ void ExecuteOp::build(OpBuilder &builder, OperationState &result, result.addOperands(dependencies); result.addOperands(operands); - // Add derived `operand_segment_sizes` attribute based on parsed operands. + // Add derived `operandSegmentSizes` attribute based on parsed operands. int32_t numDependencies = dependencies.size(); int32_t numOperands = operands.size(); auto operandSegmentSizes = @@ -208,7 +208,7 @@ ParseResult ExecuteOp::parse(OpAsmParser &parser, OperationState &result) { int32_t numOperands = valueArgs.size(); - // Add derived `operand_segment_sizes` attribute based on parsed operands. + // Add derived `operandSegmentSizes` attribute based on parsed operands. 
auto operandSegmentSizes = parser.getBuilder().getDenseI32ArrayAttr({numDependencies, numOperands}); result.addAttribute(kOperandSegmentSizesAttr, operandSegmentSizes); diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index d6778ed72c7d0..4f5452b27e3e0 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -170,7 +170,7 @@ static void buildStructuredOp(OpBuilder &b, OperationState &state, state.addTypes(derivedResultTypes); state.addAttributes(attributes); state.addAttribute( - "operand_segment_sizes", + "operandSegmentSizes", b.getDenseI32ArrayAttr({static_cast(inputs.size()), static_cast(outputs.size())})); @@ -226,18 +226,18 @@ parseCommonStructuredOpParts(OpAsmParser &parser, OperationState &result, // This is a bit complex because we're trying to be backward compatible with // operation syntax that mix the inherent attributes and the discardable // ones in the same dictionary. If the properties are used, we append the - // operand_segment_sizes there directly. Otherwise we append it to the + // operandSegmentSizes there directly. Otherwise we append it to the // discardable attributes dictionary where it is handled by the generic // Operation::create(...) method. 
if (result.propertiesAttr) { NamedAttrList attrs = llvm::cast(result.propertiesAttr); - attrs.append("operand_segment_sizes", + attrs.append("operandSegmentSizes", parser.getBuilder().getDenseI32ArrayAttr( {static_cast(inputsOperands.size()), static_cast(outputsOperands.size())})); result.propertiesAttr = attrs.getDictionary(parser.getContext()); } else { - result.addAttribute("operand_segment_sizes", + result.addAttribute("operandSegmentSizes", parser.getBuilder().getDenseI32ArrayAttr( {static_cast(inputsOperands.size()), static_cast(outputsOperands.size())})); @@ -332,7 +332,7 @@ static void printNamedStructuredOp(OpAsmPrinter &p, Operation *op, ValueRange inputs, ValueRange outputs) { p.printOptionalAttrDict( op->getAttrs(), - /*elidedAttrs=*/{"operand_segment_sizes", + /*elidedAttrs=*/{"operandSegmentSizes", // See generated code in // LinalgNamedStructuredOps.yamlgen.cpp.inc "linalg.memoized_indexing_maps"}); @@ -878,7 +878,7 @@ void GenericOp::print(OpAsmPrinter &p) { printCommonStructuredOpParts(p, SmallVector(getDpsInputOperands()), SmallVector(getDpsInitOperands())); - genericAttrNames.push_back("operand_segment_sizes"); + genericAttrNames.push_back("operandSegmentSizes"); genericAttrNamesSet.insert(genericAttrNames.back()); bool hasExtraAttrs = false; diff --git a/mlir/lib/Dialect/Math/Transforms/ExpandPatterns.cpp b/mlir/lib/Dialect/Math/Transforms/ExpandPatterns.cpp index ee8f23cf362b6..98c97fdc2c090 100644 --- a/mlir/lib/Dialect/Math/Transforms/ExpandPatterns.cpp +++ b/mlir/lib/Dialect/Math/Transforms/ExpandPatterns.cpp @@ -168,11 +168,26 @@ static LogicalResult convertPowfOp(math::PowFOp op, PatternRewriter &rewriter) { Value operandA = op.getOperand(0); Value operandB = op.getOperand(1); Type opType = operandA.getType(); + Value zero = createFloatConst(op->getLoc(), opType, 0.00, rewriter); + Value two = createFloatConst(op->getLoc(), opType, 2.00, rewriter); + Value negOne = createFloatConst(op->getLoc(), opType, -1.00, rewriter); + Value opASquared 
= b.create(opType, operandA, operandA); + Value opBHalf = b.create(opType, operandB, two); - Value logA = b.create(opType, operandA); - Value mult = b.create(opType, logA, operandB); + Value logA = b.create(opType, opASquared); + Value mult = b.create(opType, opBHalf, logA); Value expResult = b.create(opType, mult); - rewriter.replaceOp(op, expResult); + Value negExpResult = b.create(opType, expResult, negOne); + Value remainder = b.create(opType, operandB, two); + Value negCheck = + b.create(arith::CmpFPredicate::OLT, operandA, zero); + Value oddPower = + b.create(arith::CmpFPredicate::ONE, remainder, zero); + Value oddAndNeg = b.create(op->getLoc(), oddPower, negCheck); + + Value res = b.create(op->getLoc(), oddAndNeg, negExpResult, + expResult); + rewriter.replaceOp(op, res); return success(); } diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp index 5f35adf0ddaab..658756c6a6e61 100644 --- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp +++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp @@ -31,23 +31,17 @@ namespace { namespace saturated_arith { struct Wrapper { static Wrapper stride(int64_t v) { - return (ShapedType::isDynamic(v)) ? Wrapper{true, 0} - : Wrapper{false, v}; + return (ShapedType::isDynamic(v)) ? Wrapper{true, 0} : Wrapper{false, v}; } static Wrapper offset(int64_t v) { - return (ShapedType::isDynamic(v)) ? Wrapper{true, 0} - : Wrapper{false, v}; + return (ShapedType::isDynamic(v)) ? Wrapper{true, 0} : Wrapper{false, v}; } static Wrapper size(int64_t v) { return (ShapedType::isDynamic(v)) ? Wrapper{true, 0} : Wrapper{false, v}; } - int64_t asOffset() { - return saturated ? ShapedType::kDynamic : v; - } + int64_t asOffset() { return saturated ? ShapedType::kDynamic : v; } int64_t asSize() { return saturated ? ShapedType::kDynamic : v; } - int64_t asStride() { - return saturated ? ShapedType::kDynamic : v; - } + int64_t asStride() { return saturated ? 
ShapedType::kDynamic : v; } bool operator==(Wrapper other) { return (saturated && other.saturated) || (!saturated && !other.saturated && v == other.v); @@ -732,8 +726,7 @@ bool CastOp::canFoldIntoConsumerOp(CastOp castOp) { for (auto it : llvm::zip(sourceStrides, resultStrides)) { auto ss = std::get<0>(it), st = std::get<1>(it); if (ss != st) - if (ShapedType::isDynamic(ss) && - !ShapedType::isDynamic(st)) + if (ShapedType::isDynamic(ss) && !ShapedType::isDynamic(st)) return false; } @@ -766,8 +759,7 @@ bool CastOp::areCastCompatible(TypeRange inputs, TypeRange outputs) { // same. They are also compatible if either one is dynamic (see // description of MemRefCastOp for details). auto checkCompatible = [](int64_t a, int64_t b) { - return (ShapedType::isDynamic(a) || - ShapedType::isDynamic(b) || a == b); + return (ShapedType::isDynamic(a) || ShapedType::isDynamic(b) || a == b); }; if (!checkCompatible(aOffset, bOffset)) return false; @@ -1890,8 +1882,7 @@ LogicalResult ReinterpretCastOp::verify() { // Match offset in result memref type and in static_offsets attribute. int64_t expectedOffset = getStaticOffsets().front(); if (!ShapedType::isDynamic(resultOffset) && - !ShapedType::isDynamic(expectedOffset) && - resultOffset != expectedOffset) + !ShapedType::isDynamic(expectedOffset) && resultOffset != expectedOffset) return emitError("expected result type with offset = ") << expectedOffset << " instead of " << resultOffset; @@ -2945,18 +2936,6 @@ static MemRefType getCanonicalSubViewResultType( nonRankReducedType.getMemorySpace()); } -/// Compute the canonical result type of a SubViewOp. Call `inferResultType` -/// to deduce the result type. Additionally, reduce the rank of the inferred -/// result type if `currentResultType` is lower rank than `sourceType`. 
-static MemRefType getCanonicalSubViewResultType( - MemRefType currentResultType, MemRefType sourceType, - ArrayRef mixedOffsets, ArrayRef mixedSizes, - ArrayRef mixedStrides) { - return getCanonicalSubViewResultType(currentResultType, sourceType, - sourceType, mixedOffsets, mixedSizes, - mixedStrides); -} - Value mlir::memref::createCanonicalRankReducingSubViewOp( OpBuilder &b, Location loc, Value memref, ArrayRef targetShape) { auto memrefType = llvm::cast(memref.getType()); @@ -3109,9 +3088,32 @@ struct SubViewReturnTypeCanonicalizer { MemRefType operator()(SubViewOp op, ArrayRef mixedOffsets, ArrayRef mixedSizes, ArrayRef mixedStrides) { - return getCanonicalSubViewResultType(op.getType(), op.getSourceType(), - mixedOffsets, mixedSizes, - mixedStrides); + // Infer a memref type without taking into account any rank reductions. + MemRefType nonReducedType = cast(SubViewOp::inferResultType( + op.getSourceType(), mixedOffsets, mixedSizes, mixedStrides)); + + // Directly return the non-rank reduced type if there are no dropped dims. + llvm::SmallBitVector droppedDims = op.getDroppedDims(); + if (droppedDims.empty()) + return nonReducedType; + + // Take the strides and offset from the non-rank reduced type. + auto [nonReducedStrides, offset] = getStridesAndOffset(nonReducedType); + + // Drop dims from shape and strides. 
+ SmallVector targetShape; + SmallVector targetStrides; + for (int64_t i = 0; i < static_cast(mixedSizes.size()); ++i) { + if (droppedDims.test(i)) + continue; + targetStrides.push_back(nonReducedStrides[i]); + targetShape.push_back(nonReducedType.getDimSize(i)); + } + + return MemRefType::get(targetShape, nonReducedType.getElementType(), + StridedLayoutAttr::get(nonReducedType.getContext(), + offset, targetStrides), + nonReducedType.getMemorySpace()); } }; diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp index aaa5e39cd2f3d..a7b516e1e8640 100644 --- a/mlir/lib/Dialect/SCF/IR/SCF.cpp +++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp @@ -1365,7 +1365,7 @@ ParseResult ForallOp::parse(OpAsmParser &parser, OperationState &result) { result.addAttribute("staticLowerBound", staticLbs); result.addAttribute("staticUpperBound", staticUbs); result.addAttribute("staticStep", staticSteps); - result.addAttribute("operand_segment_sizes", + result.addAttribute("operandSegmentSizes", parser.getBuilder().getDenseI32ArrayAttr( {static_cast(dynamicLbs.size()), static_cast(dynamicUbs.size()), @@ -1400,7 +1400,7 @@ void ForallOp::build( result.addAttribute(getStaticStepAttrName(result.name), b.getDenseI64ArrayAttr(staticSteps)); result.addAttribute( - "operand_segment_sizes", + "operandSegmentSizes", b.getDenseI32ArrayAttr({static_cast(dynamicLbs.size()), static_cast(dynamicUbs.size()), static_cast(dynamicSteps.size()), @@ -1601,7 +1601,7 @@ struct ForallOpSingleOrZeroIterationDimsFolder op.getOutputs(), std::nullopt, nullptr); newOp.getBodyRegion().getBlocks().clear(); // The new loop needs to keep all attributes from the old one, except for - // "operand_segment_sizes" and static loop bound attributes which capture + // "operandSegmentSizes" and static loop bound attributes which capture // the outdated information of the old iteration domain. 
SmallVector elidedAttrs{newOp.getOperandSegmentSizesAttrName(), newOp.getStaticLowerBoundAttrName(), @@ -2833,7 +2833,7 @@ ParseResult ParallelOp::parse(OpAsmParser &parser, OperationState &result) { if (parser.parseRegion(*body, ivs)) return failure(); - // Set `operand_segment_sizes` attribute. + // Set `operandSegmentSizes` attribute. result.addAttribute( ParallelOp::getOperandSegmentSizeAttr(), builder.getDenseI32ArrayAttr({static_cast(lower.size()), diff --git a/mlir/lib/IR/AsmPrinter.cpp b/mlir/lib/IR/AsmPrinter.cpp index 325f986f97694..af41532670890 100644 --- a/mlir/lib/IR/AsmPrinter.cpp +++ b/mlir/lib/IR/AsmPrinter.cpp @@ -1043,6 +1043,12 @@ std::pair AliasInitializer::visitImpl( void AliasInitializer::markAliasNonDeferrable(size_t aliasIndex) { auto it = std::next(aliases.begin(), aliasIndex); + + // If already marked non-deferrable stop the recursion. + // All children should already be marked non-deferrable as well. + if (!it->second.canBeDeferred) + return; + it->second.canBeDeferred = false; // Propagate the non-deferrable flag to any child aliases. 
diff --git a/mlir/lib/IR/Dialect.cpp b/mlir/lib/IR/Dialect.cpp index 501f52b83e026..1de49769974ac 100644 --- a/mlir/lib/IR/Dialect.cpp +++ b/mlir/lib/IR/Dialect.cpp @@ -125,7 +125,8 @@ DialectInterfaceCollectionBase::DialectInterfaceCollectionBase( MLIRContext *ctx, TypeID interfaceKind, StringRef interfaceName) { for (auto *dialect : ctx->getLoadedDialects()) { #ifndef NDEBUG - dialect->handleUseOfUndefinedPromisedInterface(interfaceKind, interfaceName); + dialect->handleUseOfUndefinedPromisedInterface(interfaceKind, + interfaceName); #endif if (auto *interface = dialect->getRegisteredInterface(interfaceKind)) { interfaces.insert(interface); @@ -243,8 +244,9 @@ void DialectRegistry::applyExtensions(Dialect *dialect) const { extension.apply(ctx, requiredDialects); }; - for (const auto &extension : extensions) - applyExtension(*extension); + // Note: Additional extensions may be added while applying an extension. + for (int i = 0; i < static_cast(extensions.size()); ++i) + applyExtension(*extensions[i]); } void DialectRegistry::applyExtensions(MLIRContext *ctx) const { @@ -264,8 +266,9 @@ void DialectRegistry::applyExtensions(MLIRContext *ctx) const { extension.apply(ctx, requiredDialects); }; - for (const auto &extension : extensions) - applyExtension(*extension); + // Note: Additional extensions may be added while applying an extension. 
+ for (int i = 0; i < static_cast(extensions.size()); ++i) + applyExtension(*extensions[i]); } bool DialectRegistry::isSubsetOf(const DialectRegistry &rhs) const { diff --git a/mlir/lib/Pass/Pass.cpp b/mlir/lib/Pass/Pass.cpp index fe4597f3df3d2..a562a00eb1953 100644 --- a/mlir/lib/Pass/Pass.cpp +++ b/mlir/lib/Pass/Pass.cpp @@ -18,6 +18,7 @@ #include "mlir/IR/Threading.h" #include "mlir/IR/Verifier.h" #include "mlir/Support/FileUtilities.h" +#include "llvm/ADT/Hashing.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/Support/CommandLine.h" @@ -424,6 +425,23 @@ LogicalResult OpPassManager::initialize(MLIRContext *context, return success(); } +llvm::hash_code OpPassManager::hash() { + llvm::hash_code hashCode{}; + for (Pass &pass : getPasses()) { + // If this pass isn't an adaptor, directly hash it. + auto *adaptor = dyn_cast(&pass); + if (!adaptor) { + hashCode = llvm::hash_combine(hashCode, &pass); + continue; + } + // Otherwise, hash recursively each of the adaptor's pass managers. + for (OpPassManager &adaptorPM : adaptor->getPassManagers()) + hashCode = llvm::hash_combine(hashCode, adaptorPM.hash()); + } + return hashCode; +} + + //===----------------------------------------------------------------------===// // OpToOpPassAdaptor //===----------------------------------------------------------------------===// @@ -825,10 +843,12 @@ LogicalResult PassManager::run(Operation *op) { // Initialize all of the passes within the pass manager with a new generation. llvm::hash_code newInitKey = context->getRegistryHash(); - if (newInitKey != initializationKey) { + llvm::hash_code pipelineKey = hash(); + if (newInitKey != initializationKey || pipelineKey != pipelineInitializationKey) { if (failed(initialize(context, impl->initializationGeneration + 1))) return failure(); initializationKey = newInitKey; + pipelineInitializationKey = pipelineKey; } // Construct a top level analysis manager for the pipeline.
diff --git a/mlir/lib/Rewrite/ByteCode.cpp b/mlir/lib/Rewrite/ByteCode.cpp index c8c442823781b..e7d4c4089a991 100644 --- a/mlir/lib/Rewrite/ByteCode.cpp +++ b/mlir/lib/Rewrite/ByteCode.cpp @@ -1846,7 +1846,7 @@ void ByteCodeExecutor::executeGetOperands() { ByteCodeField rangeIndex = read(); void *result = executeGetOperandsResults( - op->getOperands(), op, index, rangeIndex, "operand_segment_sizes", + op->getOperands(), op, index, rangeIndex, "operandSegmentSizes", valueRangeMemory); if (!result) LLVM_DEBUG(llvm::dbgs() << " * Invalid operand range\n"); @@ -1872,7 +1872,7 @@ void ByteCodeExecutor::executeGetResults() { ByteCodeField rangeIndex = read(); void *result = executeGetOperandsResults( - op->getResults(), op, index, rangeIndex, "result_segment_sizes", + op->getResults(), op, index, rangeIndex, "resultSegmentSizes", valueRangeMemory); if (!result) LLVM_DEBUG(llvm::dbgs() << " * Invalid result range\n"); diff --git a/mlir/lib/Transforms/Canonicalizer.cpp b/mlir/lib/Transforms/Canonicalizer.cpp index b4ad85c7c7dad..d50019bd6aee5 100644 --- a/mlir/lib/Transforms/Canonicalizer.cpp +++ b/mlir/lib/Transforms/Canonicalizer.cpp @@ -29,7 +29,8 @@ struct Canonicalizer : public impl::CanonicalizerBase { Canonicalizer() = default; Canonicalizer(const GreedyRewriteConfig &config, ArrayRef disabledPatterns, - ArrayRef enabledPatterns) { + ArrayRef enabledPatterns) + : config(config) { this->topDownProcessingEnabled = config.useTopDownTraversal; this->enableRegionSimplification = config.enableRegionSimplification; this->maxIterations = config.maxIterations; @@ -41,30 +42,31 @@ struct Canonicalizer : public impl::CanonicalizerBase { /// Initialize the canonicalizer by building the set of patterns used during /// execution. LogicalResult initialize(MLIRContext *context) override { + // Set the config from possible pass options set in the meantime. 
+ config.useTopDownTraversal = topDownProcessingEnabled; + config.enableRegionSimplification = enableRegionSimplification; + config.maxIterations = maxIterations; + config.maxNumRewrites = maxNumRewrites; + RewritePatternSet owningPatterns(context); for (auto *dialect : context->getLoadedDialects()) dialect->getCanonicalizationPatterns(owningPatterns); for (RegisteredOperationName op : context->getRegisteredOperations()) op.getCanonicalizationPatterns(owningPatterns, context); - patterns = FrozenRewritePatternSet(std::move(owningPatterns), - disabledPatterns, enabledPatterns); + patterns = std::make_shared( + std::move(owningPatterns), disabledPatterns, enabledPatterns); return success(); } void runOnOperation() override { - GreedyRewriteConfig config; - config.useTopDownTraversal = topDownProcessingEnabled; - config.enableRegionSimplification = enableRegionSimplification; - config.maxIterations = maxIterations; - config.maxNumRewrites = maxNumRewrites; LogicalResult converged = - applyPatternsAndFoldGreedily(getOperation(), patterns, config); + applyPatternsAndFoldGreedily(getOperation(), *patterns, config); // Canonicalization is best-effort. Non-convergence is not a pass failure. 
if (testConvergence && failed(converged)) signalPassFailure(); } - - FrozenRewritePatternSet patterns; + GreedyRewriteConfig config; + std::shared_ptr patterns; }; } // namespace diff --git a/mlir/test/Bytecode/operand_segment_sizes.mlir b/mlir/test/Bytecode/operand_segment_sizes.mlir index 9791bd4e0f264..c0379c2994f49 100644 --- a/mlir/test/Bytecode/operand_segment_sizes.mlir +++ b/mlir/test/Bytecode/operand_segment_sizes.mlir @@ -2,7 +2,7 @@ func.func @roundtripOperandSizeAttr(%arg0: i32) { - // CHECK: odsOperandSegmentSizes = array}> - "test.attr_sized_operands"(%arg0, %arg0, %arg0, %arg0) <{odsOperandSegmentSizes = array}> : (i32, i32, i32, i32) -> () + // CHECK: operandSegmentSizes = array}> + "test.attr_sized_operands"(%arg0, %arg0, %arg0, %arg0) <{operandSegmentSizes = array}> : (i32, i32, i32, i32) -> () return } diff --git a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir index ab91729a0556b..b83b122f75e4b 100644 --- a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir +++ b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir @@ -79,7 +79,7 @@ func.func @wsloop(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: // CHECK: "test.payload"(%[[CAST_ARG6]], %[[CAST_ARG7]]) : (index, index) -> () "test.payload"(%arg6, %arg7) : (index, index) -> () omp.yield - }) {operand_segment_sizes = array} : (index, index, index, index, index, index) -> () + }) {operandSegmentSizes = array} : (index, index, index, index, index, index) -> () omp.terminator } return @@ -328,7 +328,7 @@ llvm.func @_QPsimple_reduction(%arg0: !llvm.ptr> {fir.bindc_nam %5 = llvm.zext %2 : i1 to i32 llvm.store %5, %4 : !llvm.ptr omp.parallel { - %6 = llvm.alloca %3 x i32 {adapt.valuebyref, in_type = i32, operand_segment_sizes = array, pinned} : (i64) -> !llvm.ptr + %6 = llvm.alloca %3 x i32 {adapt.valuebyref, in_type = i32, operandSegmentSizes = array, pinned} : (i64) -> !llvm.ptr omp.wsloop 
reduction(@eqv_reduction -> %4 : !llvm.ptr) for (%arg1) : i32 = (%1) to (%0) inclusive step (%1) { llvm.store %arg1, %6 : !llvm.ptr %7 = llvm.load %6 : !llvm.ptr diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir index e280cd65811db..80c65e14e7635 100644 --- a/mlir/test/Dialect/GPU/invalid.mlir +++ b/mlir/test/Dialect/GPU/invalid.mlir @@ -4,7 +4,7 @@ func.func @not_enough_sizes(%sz : index) { // expected-error@+1 {{expected 6 or more operands, but found 5}} "gpu.launch"(%sz, %sz, %sz, %sz, %sz) ({ gpu.return - }) {operand_segment_sizes = array} : (index, index, index, index, index) -> () + }) {operandSegmentSizes = array} : (index, index, index, index, index) -> () return } @@ -16,7 +16,7 @@ func.func @no_region_attrs(%sz : index) { ^bb1(%bx: index, %by: index, %bz: index, %tx: index, %ty: index, %tz: index): gpu.terminator - }) {operand_segment_sizes = array} : (index, index, index, index, index, index) -> () + }) {operandSegmentSizes = array} : (index, index, index, index, index, index) -> () return } @@ -38,7 +38,7 @@ func.func @launch_requires_gpu_return(%sz : index) { func.func @launch_func_too_few_operands(%sz : index) { // expected-error@+1 {{expected 6 or more operands}} "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz) - {operand_segment_sizes = array} + {operandSegmentSizes = array} : (index, index, index, index, index) -> () return } @@ -57,7 +57,7 @@ module attributes {gpu.container_module} { func.func @launch_func_missing_callee_attribute(%sz : index) { // expected-error@+1 {{'gpu.launch_func' op requires attribute 'kernel'}} "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz) - {operand_segment_sizes = array} + {operandSegmentSizes = array} : (index, index, index, index, index, index) -> () return } diff --git a/mlir/test/Dialect/LLVMIR/invalid.mlir b/mlir/test/Dialect/LLVMIR/invalid.mlir index 14141c4c243ab..cf4697b17aa46 100644 --- a/mlir/test/Dialect/LLVMIR/invalid.mlir +++ b/mlir/test/Dialect/LLVMIR/invalid.mlir @@ -887,7 
+887,7 @@ func.func @switch_wrong_number_of_weights(%arg0 : i32) { func.func @switch_case_type_mismatch(%arg0 : i64) { // expected-error@below {{expects case value type to match condition value type}} - "llvm.switch"(%arg0)[^bb1, ^bb2] <{case_operand_segments = array, case_values = dense<42> : vector<1xi32>, odsOperandSegmentSizes = array}> : (i64) -> () + "llvm.switch"(%arg0)[^bb1, ^bb2] <{case_operand_segments = array, case_values = dense<42> : vector<1xi32>, operandSegmentSizes = array}> : (i64) -> () ^bb1: // pred: ^bb0 llvm.return ^bb2: // pred: ^bb0 diff --git a/mlir/test/Dialect/Linalg/named-ops.mlir b/mlir/test/Dialect/Linalg/named-ops.mlir index 8f00d54655327..b0bb06cc8654a 100644 --- a/mlir/test/Dialect/Linalg/named-ops.mlir +++ b/mlir/test/Dialect/Linalg/named-ops.mlir @@ -178,7 +178,7 @@ func.func @depthwise_conv_2d_input_nhwc_filter_default_attributes(%input: memref // ----- func.func @depthwise_conv_2d_input_nhwc_filter_wrong_stride_element_type_properties(%input: memref<1x113x113x96xf32>, %filter: memref<3x3x96xf32>, %output: memref<1x56x56x96xf32>) { - // expected-error @+1 {{invalid properties {dilations = dense<1> : vector<2xi64>, operand_segment_sizes = array, strides = dense<2.000000e+00> : vector<2xf32>} for op linalg.depthwise_conv_2d_nhwc_hwc: Invalid attribute `strides` in property conversion: dense<2.000000e+00> : vector<2xf32>}} + // expected-error @+1 {{invalid properties {dilations = dense<1> : vector<2xi64>, operandSegmentSizes = array, strides = dense<2.000000e+00> : vector<2xf32>} for op linalg.depthwise_conv_2d_nhwc_hwc: Invalid attribute `strides` in property conversion: dense<2.000000e+00> : vector<2xf32>}} linalg.depthwise_conv_2d_nhwc_hwc <{dilations = dense<1> : vector<2xi64>, strides = dense<2.0> : vector<2xf32>}> ins(%input, %filter: memref<1x113x113x96xf32>, memref<3x3x96xf32>) outs(%output: memref<1x56x56x96xf32>) @@ -1100,7 +1100,7 @@ func.func @conv_interface_wrong_input_indexing_map( %1 = "arith.mulf"(%arg3, %arg4) : 
(f32, f32) -> f32 %2 = "arith.addf"(%arg5, %1) : (f32, f32) -> f32 "linalg.yield"(%2) : (f32) -> () - }) {dilations = dense<1> : tensor<2xi64>, linalg.memoized_indexing_maps = [#map0, #map1, #map2], operand_segment_sizes = array, strides = dense<2> : tensor<2xi64>} : (tensor, tensor, tensor) -> tensor + }) {dilations = dense<1> : tensor<2xi64>, linalg.memoized_indexing_maps = [#map0, #map1, #map2], operandSegmentSizes = array, strides = dense<2> : tensor<2xi64>} : (tensor, tensor, tensor) -> tensor return %0 : tensor } @@ -1117,7 +1117,7 @@ func.func @conv_interface_wrong_num_operands( %1 = "arith.mulf"(%arg3, %arg4) : (f32, f32) -> f32 %2 = "arith.addf"(%arg5, %1) : (f32, f32) -> f32 "linalg.yield"(%2) : (f32) -> () - }) {dilations = dense<1> : tensor<2xi64>, linalg.memoized_indexing_maps = [#map0, #map1, #map2], operand_segment_sizes = array, strides = dense<1> : tensor<2xi64>} : (tensor, tensor, tensor) -> tensor + }) {dilations = dense<1> : tensor<2xi64>, linalg.memoized_indexing_maps = [#map0, #map1, #map2], operandSegmentSizes = array, strides = dense<1> : tensor<2xi64>} : (tensor, tensor, tensor) -> tensor return %0 : tensor } diff --git a/mlir/test/Dialect/Math/expand-math.mlir b/mlir/test/Dialect/Math/expand-math.mlir index c28e2141db061..4cd6461102079 100644 --- a/mlir/test/Dialect/Math/expand-math.mlir +++ b/mlir/test/Dialect/Math/expand-math.mlir @@ -222,10 +222,21 @@ func.func @roundf_func(%a: f32) -> f32 { // CHECK-LABEL: func @powf_func // CHECK-SAME: ([[ARG0:%.+]]: f64, [[ARG1:%.+]]: f64) func.func @powf_func(%a: f64, %b: f64) ->f64 { - // CHECK-DAG: [[LOG:%.+]] = math.log [[ARG0]] - // CHECK-DAG: [[MULT:%.+]] = arith.mulf [[LOG]], [[ARG1]] + // CHECK-DAG = [[CST0:%.+]] = arith.constant 0.000000e+00 + // CHECK-DAG: [[TWO:%.+]] = arith.constant 2.000000e+00 + // CHECK-DAG: [[NEGONE:%.+]] = arith.constant -1.000000e+00 + // CHECK-DAG: [[SQR:%.+]] = arith.mulf [[ARG0]], [[ARG0]] + // CHECK-DAG: [[HALF:%.+]] = arith.divf [[ARG1]], [[TWO]] + // 
CHECK-DAG: [[LOG:%.+]] = math.log [[SQR]] + // CHECK-DAG: [[MULT:%.+]] = arith.mulf [[HALF]], [[LOG]] // CHECK-DAG: [[EXPR:%.+]] = math.exp [[MULT]] - // CHECK: return [[EXPR]] + // CHECK-DAG: [[NEGEXPR:%.+]] = arith.mulf [[EXPR]], [[NEGONE]] + // CHECK-DAG: [[REMF:%.+]] = arith.remf [[ARG1]], [[TWO]] + // CHECK-DAG: [[CMPNEG:%.+]] = arith.cmpf olt, [[ARG0]] + // CHECK-DAG: [[CMPZERO:%.+]] = arith.cmpf one, [[REMF]] + // CHECK-DAG: [[AND:%.+]] = arith.andi [[CMPZERO]], [[CMPNEG]] + // CHECK-DAG: [[SEL:%.+]] = arith.select [[AND]], [[NEGEXPR]], [[EXPR]] + // CHECK: return [[SEL]] %ret = math.powf %a, %b : f64 return %ret : f64 } diff --git a/mlir/test/Dialect/MemRef/canonicalize.mlir b/mlir/test/Dialect/MemRef/canonicalize.mlir index b65426cad30b6..df66705e83e0e 100644 --- a/mlir/test/Dialect/MemRef/canonicalize.mlir +++ b/mlir/test/Dialect/MemRef/canonicalize.mlir @@ -931,7 +931,7 @@ func.func @fold_multiple_memory_space_cast(%arg : memref) -> memref memref { %idx1 = index.constant 1 %c-2 = arith.constant -2 : index @@ -940,3 +940,18 @@ func.func private @ub_negative_alloc_size() -> memref { %alloc = memref.alloc(%c15, %c-2, %idx1) : memref return %alloc : memref } + +// ----- + +// CHECK-LABEL: func @subview_rank_reduction( +// CHECK-SAME: %[[arg0:.*]]: memref<1x384x384xf32>, %[[arg1:.*]]: index +func.func @subview_rank_reduction(%arg0: memref<1x384x384xf32>, %idx: index) + -> memref> { + %c1 = arith.constant 1 : index + // CHECK: %[[subview:.*]] = memref.subview %[[arg0]][0, %[[arg1]], %[[arg1]]] [1, 1, %[[arg1]]] [1, 1, 1] : memref<1x384x384xf32> to memref<1x?xf32, strided<[384, 1], offset: ?>> + // CHECK: %[[cast:.*]] = memref.cast %[[subview]] : memref<1x?xf32, strided<[384, 1], offset: ?>> to memref> + %0 = memref.subview %arg0[0, %idx, %idx] [1, %c1, %idx] [1, 1, 1] + : memref<1x384x384xf32> to memref> + // CHECK: return %[[cast]] + return %0 : memref> +} diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir index 
fc65fb77ffc88..009f08ced97e0 100644 --- a/mlir/test/Dialect/OpenMP/invalid.mlir +++ b/mlir/test/Dialect/OpenMP/invalid.mlir @@ -197,7 +197,7 @@ func.func @omp_simdloop(%lb : index, %ub : index, %step : i32) -> () { "omp.simdloop" (%lb, %ub, %step) ({ ^bb0(%iv: index): omp.yield - }) {operand_segment_sizes = array} : + }) {operandSegmentSizes = array} : (index, index, i32) -> () return @@ -225,7 +225,7 @@ func.func @omp_simdloop_aligned_mismatch(%arg0 : index, %arg1 : index, ^bb0(%arg5: index): "omp.yield"() : () -> () }) {alignment_values = [128], - operand_segment_sizes = array} : (index, index, index, memref, memref) -> () + operandSegmentSizes = array} : (index, index, index, memref, memref) -> () return } @@ -238,7 +238,7 @@ func.func @omp_simdloop_aligned_negative(%arg0 : index, %arg1 : index, "omp.simdloop"(%arg0, %arg1, %arg2, %arg3, %arg4) ({ ^bb0(%arg5: index): "omp.yield"() : () -> () - }) {alignment_values = [-1, 128], operand_segment_sizes = array} : (index, index, index, memref, memref) -> () + }) {alignment_values = [-1, 128], operandSegmentSizes = array} : (index, index, index, memref, memref) -> () return } @@ -251,7 +251,7 @@ func.func @omp_simdloop_unexpected_alignment(%arg0 : index, %arg1 : index, "omp.simdloop"(%arg0, %arg1, %arg2) ({ ^bb0(%arg5: index): "omp.yield"() : () -> () - }) {alignment_values = [1, 128], operand_segment_sizes = array} : (index, index, index) -> () + }) {alignment_values = [1, 128], operandSegmentSizes = array} : (index, index, index) -> () return } @@ -264,7 +264,7 @@ func.func @omp_simdloop_aligned_float(%arg0 : index, %arg1 : index, "omp.simdloop"(%arg0, %arg1, %arg2, %arg3, %arg4) ({ ^bb0(%arg5: index): "omp.yield"() : () -> () - }) {alignment_values = [1.5, 128], operand_segment_sizes = array} : (index, index, index, memref, memref) -> () + }) {alignment_values = [1.5, 128], operandSegmentSizes = array} : (index, index, index, memref, memref) -> () return } @@ -277,7 +277,7 @@ func.func 
@omp_simdloop_aligned_the_same_var(%arg0 : index, %arg1 : index, "omp.simdloop"(%arg0, %arg1, %arg2, %arg3, %arg3) ({ ^bb0(%arg5: index): "omp.yield"() : () -> () - }) {alignment_values = [1, 128], operand_segment_sizes = array} : (index, index, index, memref, memref) -> () + }) {alignment_values = [1, 128], operandSegmentSizes = array} : (index, index, index, memref, memref) -> () return } @@ -291,7 +291,7 @@ func.func @omp_simdloop_nontemporal_the_same_var(%arg0 : index, "omp.simdloop"(%arg0, %arg1, %arg2, %arg3, %arg3) ({ ^bb0(%arg5: index): "omp.yield"() : () -> () - }) {operand_segment_sizes = array} : (index, index, index, memref, memref) -> () + }) {operandSegmentSizes = array} : (index, index, index, memref, memref) -> () return } @@ -1121,7 +1121,7 @@ func.func @omp_teams_allocate(%data_var : memref) { // expected-error @below {{expected equal sizes for allocate and allocator variables}} "omp.teams" (%data_var) ({ omp.terminator - }) {operand_segment_sizes = array} : (memref) -> () + }) {operandSegmentSizes = array} : (memref) -> () omp.terminator } return @@ -1134,7 +1134,7 @@ func.func @omp_teams_num_teams1(%lb : i32) { // expected-error @below {{expected num_teams upper bound to be defined if the lower bound is defined}} "omp.teams" (%lb) ({ omp.terminator - }) {operand_segment_sizes = array} : (i32) -> () + }) {operandSegmentSizes = array} : (i32) -> () omp.terminator } return @@ -1159,7 +1159,7 @@ func.func @omp_sections(%data_var : memref) -> () { // expected-error @below {{expected equal sizes for allocate and allocator variables}} "omp.sections" (%data_var) ({ omp.terminator - }) {operand_segment_sizes = array} : (memref) -> () + }) {operandSegmentSizes = array} : (memref) -> () return } @@ -1169,7 +1169,7 @@ func.func @omp_sections(%data_var : memref) -> () { // expected-error @below {{expected as many reduction symbol references as reduction variables}} "omp.sections" (%data_var) ({ omp.terminator - }) {operand_segment_sizes = array} : (memref) 
-> () + }) {operandSegmentSizes = array} : (memref) -> () return } @@ -1284,7 +1284,7 @@ func.func @omp_single(%data_var : memref) -> () { // expected-error @below {{expected equal sizes for allocate and allocator variables}} "omp.single" (%data_var) ({ omp.barrier - }) {operand_segment_sizes = array} : (memref) -> () + }) {operandSegmentSizes = array} : (memref) -> () return } @@ -1294,7 +1294,7 @@ func.func @omp_task_depend(%data_var: memref) { // expected-error @below {{op expected as many depend values as depend variables}} "omp.task"(%data_var) ({ "omp.terminator"() : () -> () - }) {depends = [], operand_segment_sizes = array} : (memref) -> () + }) {depends = [], operandSegmentSizes = array} : (memref) -> () "func.return"() : () -> () } @@ -1486,7 +1486,7 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { "omp.taskloop"(%lb, %ub, %ub, %lb, %step, %step, %testmemref) ({ ^bb0(%arg3: i32, %arg4: i32): "omp.terminator"() : () -> () - }) {operand_segment_sizes = array} : (i32, i32, i32, i32, i32, i32, memref) -> () + }) {operandSegmentSizes = array} : (i32, i32, i32, i32, i32, i32, memref) -> () return } @@ -1499,7 +1499,7 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { "omp.taskloop"(%lb, %ub, %ub, %lb, %step, %step, %testf32, %testf32_2) ({ ^bb0(%arg3: i32, %arg4: i32): "omp.terminator"() : () -> () - }) {operand_segment_sizes = array, reductions = [@add_f32]} : (i32, i32, i32, i32, i32, i32, !llvm.ptr, !llvm.ptr) -> () + }) {operandSegmentSizes = array, reductions = [@add_f32]} : (i32, i32, i32, i32, i32, i32, !llvm.ptr, !llvm.ptr) -> () return } @@ -1512,7 +1512,7 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { "omp.taskloop"(%lb, %ub, %ub, %lb, %step, %step, %testf32) ({ ^bb0(%arg3: i32, %arg4: i32): "omp.terminator"() : () -> () - }) {operand_segment_sizes = array, reductions = [@add_f32, @add_f32]} : (i32, i32, i32, i32, i32, i32, !llvm.ptr) -> () + }) {operandSegmentSizes = array, reductions = [@add_f32, @add_f32]} : (i32, i32, i32, 
i32, i32, i32, !llvm.ptr) -> () return } @@ -1525,7 +1525,7 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { "omp.taskloop"(%lb, %ub, %ub, %lb, %step, %step, %testf32, %testf32_2) ({ ^bb0(%arg3: i32, %arg4: i32): "omp.terminator"() : () -> () - }) {in_reductions = [@add_f32], operand_segment_sizes = array} : (i32, i32, i32, i32, i32, i32, !llvm.ptr, !llvm.ptr) -> () + }) {in_reductions = [@add_f32], operandSegmentSizes = array} : (i32, i32, i32, i32, i32, i32, !llvm.ptr, !llvm.ptr) -> () return } @@ -1538,7 +1538,7 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { "omp.taskloop"(%lb, %ub, %ub, %lb, %step, %step, %testf32_2) ({ ^bb0(%arg3: i32, %arg4: i32): "omp.terminator"() : () -> () - }) {in_reductions = [@add_f32, @add_f32], operand_segment_sizes = array} : (i32, i32, i32, i32, i32, i32, !llvm.ptr) -> () + }) {in_reductions = [@add_f32, @add_f32], operandSegmentSizes = array} : (i32, i32, i32, i32, i32, i32, !llvm.ptr) -> () return } diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir index 2f0d224a3fef7..be59defd27d03 100644 --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -59,7 +59,7 @@ func.func @omp_parallel(%data_var : memref, %if_cond : i1, %num_threads : i // CHECK: omp.parallel num_threads(%{{.*}} : i32) allocate(%{{.*}} : memref -> %{{.*}} : memref) "omp.parallel"(%num_threads, %data_var, %data_var) ({ omp.terminator - }) {operand_segment_sizes = array} : (i32, memref, memref) -> () + }) {operandSegmentSizes = array} : (i32, memref, memref) -> () // CHECK: omp.barrier omp.barrier @@ -68,22 +68,22 @@ func.func @omp_parallel(%data_var : memref, %if_cond : i1, %num_threads : i // CHECK: omp.parallel if(%{{.*}}) allocate(%{{.*}} : memref -> %{{.*}} : memref) "omp.parallel"(%if_cond, %data_var, %data_var) ({ omp.terminator - }) {operand_segment_sizes = array} : (i1, memref, memref) -> () + }) {operandSegmentSizes = array} : (i1, memref, memref) -> () // test without allocate // 
CHECK: omp.parallel if(%{{.*}}) num_threads(%{{.*}} : i32) "omp.parallel"(%if_cond, %num_threads) ({ omp.terminator - }) {operand_segment_sizes = array} : (i1, i32) -> () + }) {operandSegmentSizes = array} : (i1, i32) -> () omp.terminator - }) {operand_segment_sizes = array, proc_bind_val = #omp} : (i1, i32, memref, memref) -> () + }) {operandSegmentSizes = array, proc_bind_val = #omp} : (i1, i32, memref, memref) -> () // test with multiple parameters for single variadic argument // CHECK: omp.parallel allocate(%{{.*}} : memref -> %{{.*}} : memref) "omp.parallel" (%data_var, %data_var) ({ omp.terminator - }) {operand_segment_sizes = array} : (memref, memref) -> () + }) {operandSegmentSizes = array} : (memref, memref) -> () return } @@ -141,7 +141,7 @@ func.func @omp_wsloop(%lb : index, %ub : index, %step : index, %data_var : memre "omp.wsloop" (%lb, %ub, %step) ({ ^bb0(%iv: index): omp.yield - }) {operand_segment_sizes = array, ordered_val = 1} : + }) {operandSegmentSizes = array, ordered_val = 1} : (index, index, index) -> () // CHECK: omp.wsloop linear(%{{.*}} = %{{.*}} : memref) schedule(static) @@ -149,7 +149,7 @@ func.func @omp_wsloop(%lb : index, %ub : index, %step : index, %data_var : memre "omp.wsloop" (%lb, %ub, %step, %data_var, %linear_var) ({ ^bb0(%iv: index): omp.yield - }) {operand_segment_sizes = array, schedule_val = #omp} : + }) {operandSegmentSizes = array, schedule_val = #omp} : (index, index, index, memref, i32) -> () // CHECK: omp.wsloop linear(%{{.*}} = %{{.*}} : memref, %{{.*}} = %{{.*}} : memref) schedule(static) @@ -157,7 +157,7 @@ func.func @omp_wsloop(%lb : index, %ub : index, %step : index, %data_var : memre "omp.wsloop" (%lb, %ub, %step, %data_var, %data_var, %linear_var, %linear_var) ({ ^bb0(%iv: index): omp.yield - }) {operand_segment_sizes = array, schedule_val = #omp} : + }) {operandSegmentSizes = array, schedule_val = #omp} : (index, index, index, memref, memref, i32, i32) -> () // CHECK: omp.wsloop linear(%{{.*}} = %{{.*}} : 
memref) schedule(dynamic = %{{.*}}) ordered(2) @@ -165,7 +165,7 @@ func.func @omp_wsloop(%lb : index, %ub : index, %step : index, %data_var : memre "omp.wsloop" (%lb, %ub, %step, %data_var, %linear_var, %chunk_var) ({ ^bb0(%iv: index): omp.yield - }) {operand_segment_sizes = array, schedule_val = #omp, ordered_val = 2} : + }) {operandSegmentSizes = array, schedule_val = #omp, ordered_val = 2} : (index, index, index, memref, i32, i32) -> () // CHECK: omp.wsloop schedule(auto) nowait @@ -173,7 +173,7 @@ func.func @omp_wsloop(%lb : index, %ub : index, %step : index, %data_var : memre "omp.wsloop" (%lb, %ub, %step) ({ ^bb0(%iv: index): omp.yield - }) {operand_segment_sizes = array, nowait, schedule_val = #omp} : + }) {operandSegmentSizes = array, nowait, schedule_val = #omp} : (index, index, index) -> () return @@ -333,7 +333,7 @@ func.func @omp_simdloop(%lb : index, %ub : index, %step : index) -> () { "omp.simdloop" (%lb, %ub, %step) ({ ^bb0(%iv: index): omp.yield - }) {operand_segment_sizes = array} : + }) {operandSegmentSizes = array} : (index, index, index) -> () return @@ -349,7 +349,7 @@ func.func @omp_simdloop_aligned_list(%arg0 : index, %arg1 : index, %arg2 : index ^bb0(%arg5: index): "omp.yield"() : () -> () }) {alignment_values = [32, 128], - operand_segment_sizes = array} : (index, index, index, memref, memref) -> () + operandSegmentSizes = array} : (index, index, index, memref, memref) -> () return } @@ -362,7 +362,7 @@ func.func @omp_simdloop_aligned_single(%arg0 : index, %arg1 : index, %arg2 : ind ^bb0(%arg5: index): "omp.yield"() : () -> () }) {alignment_values = [32], - operand_segment_sizes = array} : (index, index, index, memref) -> () + operandSegmentSizes = array} : (index, index, index, memref) -> () return } @@ -377,7 +377,7 @@ func.func @omp_simdloop_nontemporal_list(%arg0 : index, "omp.simdloop"(%arg0, %arg1, %arg2, %arg3, %arg4) ({ ^bb0(%arg5: index): "omp.yield"() : () -> () - }) {operand_segment_sizes = array} : (index, index, index, memref, 
memref) -> () + }) {operandSegmentSizes = array} : (index, index, index, memref, memref) -> () return } @@ -392,7 +392,7 @@ func.func @omp_simdloop_nontemporal_single(%arg0 : index, "omp.simdloop"(%arg0, %arg1, %arg2, %arg3) ({ ^bb0(%arg5: index): "omp.yield"() : () -> () - }) {operand_segment_sizes = array} : (index, index, index, memref) -> () + }) {operandSegmentSizes = array} : (index, index, index, memref) -> () return } @@ -487,7 +487,7 @@ func.func @omp_target(%if_cond : i1, %device : si32, %num_threads : i32, %map1: "omp.target"(%if_cond, %device, %num_threads) ({ // CHECK: omp.terminator omp.terminator - }) {nowait, operand_segment_sizes = array} : ( i1, si32, i32 ) -> () + }) {nowait, operandSegmentSizes = array} : ( i1, si32, i32 ) -> () // Test with optional map clause. // CHECK: omp.target map((tofrom -> %{{.*}} : memref), (alloc -> %{{.*}} : memref)) { @@ -1428,13 +1428,13 @@ func.func @omp_sectionsop(%data_var1 : memref, %data_var2 : memref, "omp.sections" (%data_var1, %data_var1) ({ // CHECK: omp.terminator omp.terminator - }) {operand_segment_sizes = array} : (memref, memref) -> () + }) {operandSegmentSizes = array} : (memref, memref) -> () // CHECK: omp.sections reduction(@add_f32 -> %{{.*}} : !llvm.ptr) "omp.sections" (%redn_var) ({ // CHECK: omp.terminator omp.terminator - }) {operand_segment_sizes = array, reductions=[@add_f32]} : (!llvm.ptr) -> () + }) {operandSegmentSizes = array, reductions=[@add_f32]} : (!llvm.ptr) -> () // CHECK: omp.sections nowait { omp.sections nowait { diff --git a/mlir/test/Dialect/PDL/invalid.mlir b/mlir/test/Dialect/PDL/invalid.mlir index c76bc9dcad72d..c6b7fe1cc1789 100644 --- a/mlir/test/Dialect/PDL/invalid.mlir +++ b/mlir/test/Dialect/PDL/invalid.mlir @@ -122,7 +122,7 @@ pdl.pattern : benefit(1) { // expected-error@below {{expected the same number of attribute values and attribute names, got 1 names and 0 values}} %op = "pdl.operation"() { attributeValueNames = ["attr"], - operand_segment_sizes = array + 
operandSegmentSizes = array } : () -> (!pdl.operation) rewrite %op with "rewriter" } @@ -230,7 +230,7 @@ pdl.pattern : benefit(1) { // expected-error@below {{expected no replacement values to be provided when the replacement operation is present}} "pdl.replace"(%root, %newOp, %newResult) { - operand_segment_sizes = array + operandSegmentSizes = array } : (!pdl.operation, !pdl.operation, !pdl.value) -> () } } @@ -276,7 +276,7 @@ pdl.pattern : benefit(1) { // expected-error@below {{expected rewrite region to be non-empty if external name is not specified}} "pdl.rewrite"(%op) ({}) { - operand_segment_sizes = array + operandSegmentSizes = array } : (!pdl.operation) -> () } @@ -289,7 +289,7 @@ pdl.pattern : benefit(1) { "pdl.rewrite"(%op, %op) ({ ^bb1: }) { - operand_segment_sizes = array + operandSegmentSizes = array }: (!pdl.operation, !pdl.operation) -> () } @@ -303,7 +303,7 @@ pdl.pattern : benefit(1) { ^bb1: }) { name = "foo", - operand_segment_sizes = array + operandSegmentSizes = array } : (!pdl.operation) -> () } diff --git a/mlir/test/Dialect/PDLInterp/invalid.mlir b/mlir/test/Dialect/PDLInterp/invalid.mlir index 0457a158430a2..c201dda71ef7f 100644 --- a/mlir/test/Dialect/PDLInterp/invalid.mlir +++ b/mlir/test/Dialect/PDLInterp/invalid.mlir @@ -19,7 +19,7 @@ pdl_interp.func @rewriter() { inferredResultTypes, inputAttributeNames = [], name = "foo.op", - operand_segment_sizes = array + operandSegmentSizes = array } : (!pdl.type) -> (!pdl.operation) pdl_interp.finalize } diff --git a/mlir/test/Dialect/SCF/invalid.mlir b/mlir/test/Dialect/SCF/invalid.mlir index 8566943ef8012..0ff3eaadc8fec 100644 --- a/mlir/test/Dialect/SCF/invalid.mlir +++ b/mlir/test/Dialect/SCF/invalid.mlir @@ -139,7 +139,7 @@ func.func @parallel_body_arguments_wrong_type( "scf.parallel"(%arg0, %arg1, %arg2) ({ ^bb0(%i0: f32): scf.yield - }) {operand_segment_sizes = array}: (index, index, index) -> () + }) {operandSegmentSizes = array}: (index, index, index) -> () return } @@ -151,7 +151,7 @@ 
func.func @parallel_body_wrong_number_of_arguments( "scf.parallel"(%arg0, %arg1, %arg2) ({ ^bb0(%i0: index, %i1: index): scf.yield - }) {operand_segment_sizes = array}: (index, index, index) -> () + }) {operandSegmentSizes = array}: (index, index, index) -> () return } @@ -689,7 +689,7 @@ func.func @parallel_missing_terminator(%0 : index) { ^bb0(%arg1: index): // expected-note @below {{terminator here}} %2 = "arith.constant"() {value = 1.000000e+00 : f32} : () -> f32 - }) {operand_segment_sizes = array} : (index, index, index) -> () + }) {operandSegmentSizes = array} : (index, index, index) -> () return } diff --git a/mlir/test/Dialect/SPIRV/IR/control-flow-ops.mlir b/mlir/test/Dialect/SPIRV/IR/control-flow-ops.mlir index af3f3ea2889f7..8496448759f0c 100644 --- a/mlir/test/Dialect/SPIRV/IR/control-flow-ops.mlir +++ b/mlir/test/Dialect/SPIRV/IR/control-flow-ops.mlir @@ -117,7 +117,7 @@ func.func @wrong_condition_type() -> () { func.func @wrong_accessor_count() -> () { %true = spirv.Constant true // expected-error @+1 {{requires 2 successors but found 1}} - "spirv.BranchConditional"(%true)[^one] {operand_segment_sizes = array} : (i1) -> () + "spirv.BranchConditional"(%true)[^one] {operandSegmentSizes = array} : (i1) -> () ^one: spirv.Return ^two: @@ -130,7 +130,7 @@ func.func @wrong_number_of_weights() -> () { %true = spirv.Constant true // expected-error @+1 {{must have exactly two branch weights}} "spirv.BranchConditional"(%true)[^one, ^two] {branch_weights = [1 : i32, 2 : i32, 3 : i32], - operand_segment_sizes = array} : (i1) -> () + operandSegmentSizes = array} : (i1) -> () ^one: spirv.Return ^two: diff --git a/mlir/test/Dialect/Transform/ops-invalid.mlir b/mlir/test/Dialect/Transform/ops-invalid.mlir index c72af7363f67f..3e30947769eb4 100644 --- a/mlir/test/Dialect/Transform/ops-invalid.mlir +++ b/mlir/test/Dialect/Transform/ops-invalid.mlir @@ -76,7 +76,7 @@ transform.sequence failures(propagate) { "transform.sequence"(%0) ({ ^bb0(%arg0: !transform.any_op): 
"transform.yield"() : () -> () -}) {failure_propagation_mode = 1 : i32, operand_segment_sizes = array} : (!transform.any_op) -> () +}) {failure_propagation_mode = 1 : i32, operandSegmentSizes = array} : (!transform.any_op) -> () // ----- diff --git a/mlir/test/IR/parser.mlir b/mlir/test/IR/parser.mlir index 66c9adca8f98c..0193fae37af7f 100644 --- a/mlir/test/IR/parser.mlir +++ b/mlir/test/IR/parser.mlir @@ -460,7 +460,7 @@ func.func @verbose_terminators() -> (i1, i17) { ^bb1(%x : i1, %y : i17): // CHECK: cf.cond_br %{{.*}}, ^bb2(%{{.*}} : i17), ^bb3(%{{.*}}, %{{.*}} : i1, i17) - "cf.cond_br"(%x, %y, %x, %y) [^bb2, ^bb3] {operand_segment_sizes = array} : (i1, i17, i1, i17) -> () + "cf.cond_br"(%x, %y, %x, %y) [^bb2, ^bb3] {operandSegmentSizes = array} : (i1, i17, i1, i17) -> () ^bb2(%a : i17): %true = arith.constant true diff --git a/mlir/test/IR/recursive-type.mlir b/mlir/test/IR/recursive-type.mlir index bc9b2cdbea6b6..121ba095573ba 100644 --- a/mlir/test/IR/recursive-type.mlir +++ b/mlir/test/IR/recursive-type.mlir @@ -1,6 +1,8 @@ // RUN: mlir-opt %s -test-recursive-types | FileCheck %s // CHECK: !testrec = !test.test_rec> +// CHECK: ![[$NAME:.*]] = !test.test_rec_alias> +// CHECK: ![[$NAME2:.*]] = !test.test_rec_alias, i32>> // CHECK-LABEL: @roundtrip func.func @roundtrip() { @@ -12,6 +14,16 @@ func.func @roundtrip() { // into inifinite recursion. 
// CHECK: !testrec "test.dummy_op_for_roundtrip"() : () -> !test.test_rec> + + // CHECK: () -> ![[$NAME]] + // CHECK: () -> ![[$NAME]] + "test.dummy_op_for_roundtrip"() : () -> !test.test_rec_alias> + "test.dummy_op_for_roundtrip"() : () -> !test.test_rec_alias> + + // CHECK: () -> ![[$NAME2]] + // CHECK: () -> ![[$NAME2]] + "test.dummy_op_for_roundtrip"() : () -> !test.test_rec_alias, i32>> + "test.dummy_op_for_roundtrip"() : () -> !test.test_rec_alias, i32>> return } diff --git a/mlir/test/IR/traits.mlir b/mlir/test/IR/traits.mlir index 7d922ecf67de5..0402ebe758750 100644 --- a/mlir/test/IR/traits.mlir +++ b/mlir/test/IR/traits.mlir @@ -383,101 +383,101 @@ func.func private @foo() // ----- func.func @failedMissingOperandSizeAttr(%arg: i32) { - // expected-error @+1 {{op operand count (4) does not match with the total size (0) specified in attribute 'operand_segment_sizes'}} + // expected-error @+1 {{op operand count (4) does not match with the total size (0) specified in attribute 'operandSegmentSizes'}} "test.attr_sized_operands"(%arg, %arg, %arg, %arg) : (i32, i32, i32, i32) -> () } // ----- func.func @failedOperandSizeAttrWrongType(%arg: i32) { - // expected-error @+1 {{op operand count (4) does not match with the total size (0) specified in attribute 'operand_segment_sizes'}} - "test.attr_sized_operands"(%arg, %arg, %arg, %arg) {operand_segment_sizes = 10} : (i32, i32, i32, i32) -> () + // expected-error @+1 {{op operand count (4) does not match with the total size (0) specified in attribute 'operandSegmentSizes'}} + "test.attr_sized_operands"(%arg, %arg, %arg, %arg) {operandSegmentSizes = 10} : (i32, i32, i32, i32) -> () } // ----- func.func @failedOperandSizeAttrWrongElementType(%arg: i32) { - // expected-error @+1 {{op operand count (4) does not match with the total size (0) specified in attribute 'operand_segment_sizes'}} - "test.attr_sized_operands"(%arg, %arg, %arg, %arg) {operand_segment_sizes = array} : (i32, i32, i32, i32) -> () + // expected-error 
@+1 {{op operand count (4) does not match with the total size (0) specified in attribute 'operandSegmentSizes'}} + "test.attr_sized_operands"(%arg, %arg, %arg, %arg) {operandSegmentSizes = array} : (i32, i32, i32, i32) -> () } // ----- func.func @failedOperandSizeAttrNegativeValue(%arg: i32) { - // expected-error @+1 {{'operand_segment_sizes' attribute cannot have negative elements}} - "test.attr_sized_operands"(%arg, %arg, %arg, %arg) {operand_segment_sizes = array} : (i32, i32, i32, i32) -> () + // expected-error @+1 {{'operandSegmentSizes' attribute cannot have negative elements}} + "test.attr_sized_operands"(%arg, %arg, %arg, %arg) {operandSegmentSizes = array} : (i32, i32, i32, i32) -> () } // ----- func.func @failedOperandSizeAttrWrongTotalSize(%arg: i32) { - // expected-error @+1 {{operand count (4) does not match with the total size (3) specified in attribute 'operand_segment_sizes'}} - "test.attr_sized_operands"(%arg, %arg, %arg, %arg) {operand_segment_sizes = array} : (i32, i32, i32, i32) -> () + // expected-error @+1 {{operand count (4) does not match with the total size (3) specified in attribute 'operandSegmentSizes'}} + "test.attr_sized_operands"(%arg, %arg, %arg, %arg) {operandSegmentSizes = array} : (i32, i32, i32, i32) -> () } // ----- func.func @failedOperandSizeAttrWrongCount(%arg: i32) { - // expected-error @+1 {{test.attr_sized_operands' op operand count (4) does not match with the total size (0) specified in attribute 'operand_segment_sizes}} - "test.attr_sized_operands"(%arg, %arg, %arg, %arg) {operand_segment_sizes = array} : (i32, i32, i32, i32) -> () + // expected-error @+1 {{test.attr_sized_operands' op operand count (4) does not match with the total size (0) specified in attribute 'operandSegmentSizes}} + "test.attr_sized_operands"(%arg, %arg, %arg, %arg) {operandSegmentSizes = array} : (i32, i32, i32, i32) -> () } // ----- func.func @succeededOperandSizeAttr(%arg: i32) { // CHECK: test.attr_sized_operands - 
"test.attr_sized_operands"(%arg, %arg, %arg, %arg) {operand_segment_sizes = array} : (i32, i32, i32, i32) -> () + "test.attr_sized_operands"(%arg, %arg, %arg, %arg) {operandSegmentSizes = array} : (i32, i32, i32, i32) -> () return } // ----- func.func @failedMissingResultSizeAttr() { - // expected-error @+1 {{op result count (4) does not match with the total size (0) specified in attribute 'result_segment_sizes'}} + // expected-error @+1 {{op result count (4) does not match with the total size (0) specified in attribute 'resultSegmentSizes'}} %0:4 = "test.attr_sized_results"() : () -> (i32, i32, i32, i32) } // ----- func.func @failedResultSizeAttrWrongType() { - // expected-error @+1 {{ op result count (4) does not match with the total size (0) specified in attribute 'result_segment_sizes'}} - %0:4 = "test.attr_sized_results"() {result_segment_sizes = 10} : () -> (i32, i32, i32, i32) + // expected-error @+1 {{ op result count (4) does not match with the total size (0) specified in attribute 'resultSegmentSizes'}} + %0:4 = "test.attr_sized_results"() {resultSegmentSizes = 10} : () -> (i32, i32, i32, i32) } // ----- func.func @failedResultSizeAttrWrongElementType() { - // expected-error @+1 {{ op result count (4) does not match with the total size (0) specified in attribute 'result_segment_sizes'}} - %0:4 = "test.attr_sized_results"() {result_segment_sizes = array} : () -> (i32, i32, i32, i32) + // expected-error @+1 {{ op result count (4) does not match with the total size (0) specified in attribute 'resultSegmentSizes'}} + %0:4 = "test.attr_sized_results"() {resultSegmentSizes = array} : () -> (i32, i32, i32, i32) } // ----- func.func @failedResultSizeAttrNegativeValue() { - // expected-error @+1 {{'result_segment_sizes' attribute cannot have negative elements}} - %0:4 = "test.attr_sized_results"() {result_segment_sizes = array} : () -> (i32, i32, i32, i32) + // expected-error @+1 {{'resultSegmentSizes' attribute cannot have negative elements}} + %0:4 = 
"test.attr_sized_results"() {resultSegmentSizes = array} : () -> (i32, i32, i32, i32) } // ----- func.func @failedResultSizeAttrWrongTotalSize() { - // expected-error @+1 {{result count (4) does not match with the total size (3) specified in attribute 'result_segment_sizes'}} - %0:4 = "test.attr_sized_results"() {result_segment_sizes = array} : () -> (i32, i32, i32, i32) + // expected-error @+1 {{result count (4) does not match with the total size (3) specified in attribute 'resultSegmentSizes'}} + %0:4 = "test.attr_sized_results"() {resultSegmentSizes = array} : () -> (i32, i32, i32, i32) } // ----- func.func @failedResultSizeAttrWrongCount() { - // expected-error @+1 {{ op result count (4) does not match with the total size (0) specified in attribute 'result_segment_sizes'}} - %0:4 = "test.attr_sized_results"() {result_segment_sizes = array} : () -> (i32, i32, i32, i32) + // expected-error @+1 {{ op result count (4) does not match with the total size (0) specified in attribute 'resultSegmentSizes'}} + %0:4 = "test.attr_sized_results"() {resultSegmentSizes = array} : () -> (i32, i32, i32, i32) } // ----- func.func @succeededResultSizeAttr() { // CHECK: test.attr_sized_results - %0:4 = "test.attr_sized_results"() {result_segment_sizes = array} : () -> (i32, i32, i32, i32) + %0:4 = "test.attr_sized_results"() {resultSegmentSizes = array} : () -> (i32, i32, i32, i32) return } diff --git a/mlir/test/Rewrite/pdl-bytecode.mlir b/mlir/test/Rewrite/pdl-bytecode.mlir index 57bec8ce37073..513ff3c40bc64 100644 --- a/mlir/test/Rewrite/pdl-bytecode.mlir +++ b/mlir/test/Rewrite/pdl-bytecode.mlir @@ -1093,7 +1093,7 @@ module @patterns { // CHECK-NEXT: "test.success"(%[[INPUTS]]#4) : (i32) -> () module @ir attributes { test.get_operands_2 } { %inputs:5 = "test.producer"() : () -> (i32, i32, i32, i32, i32) - "test.attr_sized_operands"(%inputs#0, %inputs#1, %inputs#2, %inputs#3, %inputs#4) {operand_segment_sizes = array} : (i32, i32, i32, i32, i32) -> () + 
"test.attr_sized_operands"(%inputs#0, %inputs#1, %inputs#2, %inputs#3, %inputs#4) {operandSegmentSizes = array} : (i32, i32, i32, i32, i32) -> () } // ----- @@ -1246,7 +1246,7 @@ module @patterns { // CHECK: %[[RESULTS_2_SINGLE:.*]] = "test.success"() : () -> i32 // CHECK: "test.consumer"(%[[RESULTS_1]]#0, %[[RESULTS_1]]#1, %[[RESULTS_1]]#2, %[[RESULTS_1]]#3, %[[RESULTS_2]]) : (i32, i32, i32, i32, i32) -> () module @ir attributes { test.get_results_2 } { - %results:5 = "test.attr_sized_results"() {result_segment_sizes = array} : () -> (i32, i32, i32, i32, i32) + %results:5 = "test.attr_sized_results"() {resultSegmentSizes = array} : () -> (i32, i32, i32, i32, i32) "test.consumer"(%results#0, %results#1, %results#2, %results#3, %results#4) : (i32, i32, i32, i32, i32) -> () } diff --git a/mlir/test/Target/LLVMIR/omptarget-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-llvm.mlir index 1573f30d5b391..a58d4f1463a0b 100644 --- a/mlir/test/Target/LLVMIR/omptarget-llvm.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-llvm.mlir @@ -2,7 +2,7 @@ llvm.func @_QPopenmp_target_data() { %0 = llvm.mlir.constant(1 : i64) : i64 - %1 = llvm.alloca %0 x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array, uniq_name = "_QFopenmp_target_dataEi"} : (i64) -> !llvm.ptr + %1 = llvm.alloca %0 x i32 {bindc_name = "i", in_type = i32, operandSegmentSizes = array, uniq_name = "_QFopenmp_target_dataEi"} : (i64) -> !llvm.ptr omp.target_data map((tofrom -> %1 : !llvm.ptr)) { %2 = llvm.mlir.constant(99 : i32) : i32 llvm.store %2, %1 : !llvm.ptr @@ -79,9 +79,9 @@ llvm.func @_QPopenmp_target_data_region(%1 : !llvm.ptr>) { llvm.func @_QPomp_target_enter_exit(%1 : !llvm.ptr>, %3 : !llvm.ptr>) { %4 = llvm.mlir.constant(1 : i64) : i64 - %5 = llvm.alloca %4 x i32 {bindc_name = "dvc", in_type = i32, operand_segment_sizes = array, uniq_name = "_QFomp_target_enter_exitEdvc"} : (i64) -> !llvm.ptr + %5 = llvm.alloca %4 x i32 {bindc_name = "dvc", in_type = i32, operandSegmentSizes = array, uniq_name = 
"_QFomp_target_enter_exitEdvc"} : (i64) -> !llvm.ptr %6 = llvm.mlir.constant(1 : i64) : i64 - %7 = llvm.alloca %6 x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array, uniq_name = "_QFomp_target_enter_exitEi"} : (i64) -> !llvm.ptr + %7 = llvm.alloca %6 x i32 {bindc_name = "i", in_type = i32, operandSegmentSizes = array, uniq_name = "_QFomp_target_enter_exitEi"} : (i64) -> !llvm.ptr %8 = llvm.mlir.constant(5 : i32) : i32 llvm.store %8, %7 : !llvm.ptr %9 = llvm.mlir.constant(2 : i32) : i32 diff --git a/mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir index 126fff70ce3b1..bead0200b2731 100644 --- a/mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir @@ -5,11 +5,11 @@ module attributes {omp.is_target_device = true} { %0 = llvm.mlir.constant(20 : i32) : i32 %1 = llvm.mlir.constant(10 : i32) : i32 %2 = llvm.mlir.constant(1 : i64) : i64 - %3 = llvm.alloca %2 x i32 {bindc_name = "a", in_type = i32, operand_segment_sizes = array, uniq_name = "_QFomp_target_regionEa"} : (i64) -> !llvm.ptr + %3 = llvm.alloca %2 x i32 {bindc_name = "a", in_type = i32, operandSegmentSizes = array, uniq_name = "_QFomp_target_regionEa"} : (i64) -> !llvm.ptr %4 = llvm.mlir.constant(1 : i64) : i64 - %5 = llvm.alloca %4 x i32 {bindc_name = "b", in_type = i32, operand_segment_sizes = array, uniq_name = "_QFomp_target_regionEb"} : (i64) -> !llvm.ptr + %5 = llvm.alloca %4 x i32 {bindc_name = "b", in_type = i32, operandSegmentSizes = array, uniq_name = "_QFomp_target_regionEb"} : (i64) -> !llvm.ptr %6 = llvm.mlir.constant(1 : i64) : i64 - %7 = llvm.alloca %6 x i32 {bindc_name = "c", in_type = i32, operand_segment_sizes = array, uniq_name = "_QFomp_target_regionEc"} : (i64) -> !llvm.ptr + %7 = llvm.alloca %6 x i32 {bindc_name = "c", in_type = i32, operandSegmentSizes = array, uniq_name = "_QFomp_target_regionEc"} : (i64) -> !llvm.ptr llvm.store %1, %3 
: !llvm.ptr llvm.store %0, %5 : !llvm.ptr omp.target { diff --git a/mlir/test/Target/LLVMIR/omptarget-region-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-region-llvm.mlir index e130f96af79f7..9ba083d5137d8 100644 --- a/mlir/test/Target/LLVMIR/omptarget-region-llvm.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-region-llvm.mlir @@ -5,11 +5,11 @@ module attributes {omp.is_target_device = false} { %0 = llvm.mlir.constant(20 : i32) : i32 %1 = llvm.mlir.constant(10 : i32) : i32 %2 = llvm.mlir.constant(1 : i64) : i64 - %3 = llvm.alloca %2 x i32 {bindc_name = "a", in_type = i32, operand_segment_sizes = array, uniq_name = "_QFomp_target_regionEa"} : (i64) -> !llvm.ptr + %3 = llvm.alloca %2 x i32 {bindc_name = "a", in_type = i32, operandSegmentSizes = array, uniq_name = "_QFomp_target_regionEa"} : (i64) -> !llvm.ptr %4 = llvm.mlir.constant(1 : i64) : i64 - %5 = llvm.alloca %4 x i32 {bindc_name = "b", in_type = i32, operand_segment_sizes = array, uniq_name = "_QFomp_target_regionEb"} : (i64) -> !llvm.ptr + %5 = llvm.alloca %4 x i32 {bindc_name = "b", in_type = i32, operandSegmentSizes = array, uniq_name = "_QFomp_target_regionEb"} : (i64) -> !llvm.ptr %6 = llvm.mlir.constant(1 : i64) : i64 - %7 = llvm.alloca %6 x i32 {bindc_name = "c", in_type = i32, operand_segment_sizes = array, uniq_name = "_QFomp_target_regionEc"} : (i64) -> !llvm.ptr + %7 = llvm.alloca %6 x i32 {bindc_name = "c", in_type = i32, operandSegmentSizes = array, uniq_name = "_QFomp_target_regionEc"} : (i64) -> !llvm.ptr llvm.store %1, %3 : !llvm.ptr llvm.store %0, %5 : !llvm.ptr omp.target { diff --git a/mlir/test/Target/LLVMIR/omptarget-region-parallel-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-region-parallel-llvm.mlir index cfa8039c94ba2..7f5e79db9bcd6 100644 --- a/mlir/test/Target/LLVMIR/omptarget-region-parallel-llvm.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-region-parallel-llvm.mlir @@ -5,11 +5,11 @@ module attributes {omp.is_target_device = false} { %0 = llvm.mlir.constant(20 : i32) : i32 %1 = 
llvm.mlir.constant(10 : i32) : i32 %2 = llvm.mlir.constant(1 : i64) : i64 - %3 = llvm.alloca %2 x i32 {bindc_name = "a", in_type = i32, operand_segment_sizes = array, uniq_name = "_QFomp_target_regionEa"} : (i64) -> !llvm.ptr + %3 = llvm.alloca %2 x i32 {bindc_name = "a", in_type = i32, operandSegmentSizes = array, uniq_name = "_QFomp_target_regionEa"} : (i64) -> !llvm.ptr %4 = llvm.mlir.constant(1 : i64) : i64 - %5 = llvm.alloca %4 x i32 {bindc_name = "b", in_type = i32, operand_segment_sizes = array, uniq_name = "_QFomp_target_regionEb"} : (i64) -> !llvm.ptr + %5 = llvm.alloca %4 x i32 {bindc_name = "b", in_type = i32, operandSegmentSizes = array, uniq_name = "_QFomp_target_regionEb"} : (i64) -> !llvm.ptr %6 = llvm.mlir.constant(1 : i64) : i64 - %7 = llvm.alloca %6 x i32 {bindc_name = "c", in_type = i32, operand_segment_sizes = array, uniq_name = "_QFomp_target_regionEc"} : (i64) -> !llvm.ptr + %7 = llvm.alloca %6 x i32 {bindc_name = "c", in_type = i32, operandSegmentSizes = array, uniq_name = "_QFomp_target_regionEc"} : (i64) -> !llvm.ptr llvm.store %1, %3 : !llvm.ptr llvm.store %0, %5 : !llvm.ptr omp.target { diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir index 6469868b8751f..4fb00660cc423 100644 --- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir +++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir @@ -310,7 +310,7 @@ llvm.func @wsloop_simple(%arg0: !llvm.ptr) { llvm.store %3, %4 : !llvm.ptr omp.yield // CHECK: call void @__kmpc_for_static_fini(ptr @[[$loc_struct]], - }) {operand_segment_sizes = array} : (i64, i64, i64) -> () + }) {operandSegmentSizes = array} : (i64, i64, i64) -> () omp.terminator } llvm.return @@ -330,7 +330,7 @@ llvm.func @wsloop_inclusive_1(%arg0: !llvm.ptr) { %4 = llvm.getelementptr %arg0[%arg1] : (!llvm.ptr, i64) -> !llvm.ptr llvm.store %3, %4 : !llvm.ptr omp.yield - }) {operand_segment_sizes = array} : (i64, i64, i64) -> () + }) {operandSegmentSizes = array} : (i64, i64, i64) -> () llvm.return } @@ 
-348,7 +348,7 @@ llvm.func @wsloop_inclusive_2(%arg0: !llvm.ptr) { %4 = llvm.getelementptr %arg0[%arg1] : (!llvm.ptr, i64) -> !llvm.ptr llvm.store %3, %4 : !llvm.ptr omp.yield - }) {inclusive, operand_segment_sizes = array} : (i64, i64, i64) -> () + }) {inclusive, operandSegmentSizes = array} : (i64, i64, i64) -> () llvm.return } @@ -628,7 +628,7 @@ llvm.func @simdloop_simple(%lb : i64, %ub : i64, %step : i64, %arg0: !llvm.ptr, i64) -> !llvm.ptr llvm.store %3, %4 : !llvm.ptr omp.yield - }) {operand_segment_sizes = array} : + }) {operandSegmentSizes = array} : (i64, i64, i64) -> () llvm.return @@ -733,9 +733,9 @@ llvm.func @simdloop_simple_multiple_simdlen_safelen(%lb1 : i64, %ub1 : i64, %ste // CHECK-LABEL: @simdloop_if llvm.func @simdloop_if(%arg0: !llvm.ptr {fir.bindc_name = "n"}, %arg1: !llvm.ptr {fir.bindc_name = "threshold"}) { %0 = llvm.mlir.constant(1 : i64) : i64 - %1 = llvm.alloca %0 x i32 {adapt.valuebyref, in_type = i32, operand_segment_sizes = array} : (i64) -> !llvm.ptr + %1 = llvm.alloca %0 x i32 {adapt.valuebyref, in_type = i32, operandSegmentSizes = array} : (i64) -> !llvm.ptr %2 = llvm.mlir.constant(1 : i64) : i64 - %3 = llvm.alloca %2 x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array, uniq_name = "_QFtest_simdEi"} : (i64) -> !llvm.ptr + %3 = llvm.alloca %2 x i32 {bindc_name = "i", in_type = i32, operandSegmentSizes = array, uniq_name = "_QFtest_simdEi"} : (i64) -> !llvm.ptr %4 = llvm.mlir.constant(0 : i32) : i32 %5 = llvm.load %arg0 : !llvm.ptr %6 = llvm.mlir.constant(1 : i32) : i32 diff --git a/mlir/test/Transforms/canonicalize-block-merge.mlir b/mlir/test/Transforms/canonicalize-block-merge.mlir index d33c911e042d7..bf44973ab646c 100644 --- a/mlir/test/Transforms/canonicalize-block-merge.mlir +++ b/mlir/test/Transforms/canonicalize-block-merge.mlir @@ -257,7 +257,7 @@ func.func @nomerge(%arg0: i32, %i: i32) { func.func @mismatch_dominance() -> i32 { // CHECK: %[[RES:.*]] = "test.producing_br"() %0 = 
"test.producing_br"()[^bb1, ^bb2] { - operand_segment_sizes = array + operandSegmentSizes = array } : () -> i32 ^bb1: diff --git a/mlir/test/Transforms/sccp.mlir b/mlir/test/Transforms/sccp.mlir index db24432b65cc6..dcae052c29c24 100644 --- a/mlir/test/Transforms/sccp.mlir +++ b/mlir/test/Transforms/sccp.mlir @@ -204,7 +204,7 @@ func.func @simple_produced_operand() -> (i32, i32) { // CHECK: %[[ONE:.*]] = arith.constant 1 %1 = arith.constant 1 : i32 "test.internal_br"(%1) [^bb1, ^bb2] { - operand_segment_sizes = array + operandSegmentSizes = array } : (i32) -> () ^bb1: diff --git a/mlir/test/lib/Dialect/Test/TestDialect.cpp b/mlir/test/lib/Dialect/Test/TestDialect.cpp index 072f6ff4b84d3..debe733f59be4 100644 --- a/mlir/test/lib/Dialect/Test/TestDialect.cpp +++ b/mlir/test/lib/Dialect/Test/TestDialect.cpp @@ -312,6 +312,10 @@ struct TestOpAsmInterface : public OpAsmDialectInterface { return AliasResult::FinalAlias; } } + if (auto recAliasType = dyn_cast(type)) { + os << recAliasType.getName(); + return AliasResult::FinalAlias; + } return AliasResult::NoAlias; } diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td index 4eb19e6dd6fe2..12a02cf72d2b3 100644 --- a/mlir/test/lib/Dialect/Test/TestOps.td +++ b/mlir/test/lib/Dialect/Test/TestOps.td @@ -264,6 +264,16 @@ def DerivedTypeAttrOp : TEST_Op<"derived_type_attr", []> { "$_builder.getI32IntegerAttr($_self)">; } +def TestPropOp : TEST_Op<"prop">, + Arguments<(ins Variadic:$upperInits, + I32ElementsAttr:$transforms)>, + Results<(outs Variadic:$results)> { + DerivedAttr upperLen = DerivedAttr<"uint32_t", [{ + return getUpperInits().size() / getTransforms().size(); + }], [{ $_builder.getI32IntegerAttr($_self) }]>; +} + + def StringElementsAttrOp : TEST_Op<"string_elements_attr"> { let arguments = (ins StringElementsAttr:$scalar_string_attr diff --git a/mlir/test/lib/Dialect/Test/TestTypeDefs.td b/mlir/test/lib/Dialect/Test/TestTypeDefs.td index 15dbd74aec118..2a8bdad8fb25d 100644 --- 
a/mlir/test/lib/Dialect/Test/TestTypeDefs.td +++ b/mlir/test/lib/Dialect/Test/TestTypeDefs.td @@ -369,4 +369,26 @@ def TestTypeElseAnchorStruct : Test_Type<"TestTypeElseAnchorStruct"> { let assemblyFormat = "`<` (`?`) : (struct($a, $b)^)? `>`"; } +def TestI32 : Test_Type<"TestI32"> { + let mnemonic = "i32"; +} + +def TestRecursiveAlias + : Test_Type<"TestRecursiveAlias", [NativeTypeTrait<"IsMutable">]> { + let mnemonic = "test_rec_alias"; + let storageClass = "TestRecursiveTypeStorage"; + let storageNamespace = "test"; + let genStorageClass = 0; + + let parameters = (ins "llvm::StringRef":$name); + + let hasCustomAssemblyFormat = 1; + + let extraClassDeclaration = [{ + Type getBody() const; + + void setBody(Type type); + }]; +} + #endif // TEST_TYPEDEFS diff --git a/mlir/test/lib/Dialect/Test/TestTypes.cpp b/mlir/test/lib/Dialect/Test/TestTypes.cpp index 0633752067a14..20dc03a765269 100644 --- a/mlir/test/lib/Dialect/Test/TestTypes.cpp +++ b/mlir/test/lib/Dialect/Test/TestTypes.cpp @@ -482,3 +482,54 @@ void TestDialect::printType(Type type, DialectAsmPrinter &printer) const { SetVector stack; printTestType(type, printer, stack); } + +Type TestRecursiveAliasType::getBody() const { return getImpl()->body; } + +void TestRecursiveAliasType::setBody(Type type) { (void)Base::mutate(type); } + +StringRef TestRecursiveAliasType::getName() const { return getImpl()->name; } + +Type TestRecursiveAliasType::parse(AsmParser &parser) { + thread_local static SetVector stack; + + StringRef name; + if (parser.parseLess() || parser.parseKeyword(&name)) + return Type(); + auto rec = TestRecursiveAliasType::get(parser.getContext(), name); + + // If this type already has been parsed above in the stack, expect just the + // name. + if (stack.contains(rec)) { + if (failed(parser.parseGreater())) + return Type(); + return rec; + } + + // Otherwise, parse the body and update the type. 
+ if (failed(parser.parseComma())) + return Type(); + stack.insert(rec); + Type subtype; + if (parser.parseType(subtype)) + return nullptr; + stack.pop_back(); + if (!subtype || failed(parser.parseGreater())) + return Type(); + + rec.setBody(subtype); + + return rec; +} + +void TestRecursiveAliasType::print(AsmPrinter &printer) const { + thread_local static SetVector stack; + + printer << "<" << getName(); + if (!stack.contains(*this)) { + printer << ", "; + stack.insert(*this); + printer << getBody(); + stack.pop_back(); + } + printer << ">"; +} diff --git a/mlir/test/lib/Dialect/Test/TestTypes.h b/mlir/test/lib/Dialect/Test/TestTypes.h index c7d169d020d56..0ce86dd70ab90 100644 --- a/mlir/test/lib/Dialect/Test/TestTypes.h +++ b/mlir/test/lib/Dialect/Test/TestTypes.h @@ -91,9 +91,6 @@ struct FieldParser> { #include "TestTypeInterfaces.h.inc" -#define GET_TYPEDEF_CLASSES -#include "TestTypeDefs.h.inc" - namespace test { /// Storage for simple named recursive types, where the type is identified by @@ -150,4 +147,7 @@ class TestRecursiveType } // namespace test +#define GET_TYPEDEF_CLASSES +#include "TestTypeDefs.h.inc" + #endif // MLIR_TESTTYPES_H diff --git a/mlir/test/mlir-cpu-runner/test-expand-math-approx.mlir b/mlir/test/mlir-cpu-runner/test-expand-math-approx.mlir index 30f30def56fdd..847c41fec9135 100644 --- a/mlir/test/mlir-cpu-runner/test-expand-math-approx.mlir +++ b/mlir/test/mlir-cpu-runner/test-expand-math-approx.mlir @@ -195,7 +195,7 @@ func.func @powf() { %a_p = arith.constant 2.0 : f64 call @func_powff64(%a, %a_p) : (f64, f64) -> () - // CHECK-NEXT: nan + // CHECK-NEXT: -27 %b = arith.constant -3.0 : f64 %b_p = arith.constant 3.0 : f64 call @func_powff64(%b, %b_p) : (f64, f64) -> () @@ -220,16 +220,9 @@ func.func @powf() { %f_p = arith.constant 1.2 : f64 call @func_powff64(%f, %f_p) : (f64, f64) -> () - // CHECK-NEXT: nan - %g = arith.constant 0xff80000000000000 : f64 - call @func_powff64(%g, %g) : (f64, f64) -> () - - // CHECK-NEXT: nan - %h = 
arith.constant 0x7fffffffffffffff : f64 - call @func_powff64(%h, %h) : (f64, f64) -> () - // CHECK-NEXT: nan %i = arith.constant 1.0 : f64 + %h = arith.constant 0x7fffffffffffffff : f64 call @func_powff64(%i, %h) : (f64, f64) -> () // CHECK-NEXT: inf diff --git a/mlir/test/mlir-tblgen/attr-or-type-format.td b/mlir/test/mlir-tblgen/attr-or-type-format.td index 230fa90713f1a..2782f55bc966e 100644 --- a/mlir/test/mlir-tblgen/attr-or-type-format.td +++ b/mlir/test/mlir-tblgen/attr-or-type-format.td @@ -648,5 +648,5 @@ def TypeN : TestType<"TestP"> { // DEFAULT_TYPE_PARSER: TestDialect::parseType(::mlir::DialectAsmParser &parser) // DEFAULT_TYPE_PARSER: auto parseResult = parseOptionalDynamicType(mnemonic, parser, genType); // DEFAULT_TYPE_PARSER: if (parseResult.has_value()) { -// DEFAULT_TYPE_PARSER: if (::mlir::succeeded(parseResult.getValue())) +// DEFAULT_TYPE_PARSER: if (::mlir::succeeded(parseResult.value())) // DEFAULT_TYPE_PARSER: return genType; \ No newline at end of file diff --git a/mlir/test/mlir-tblgen/op-decl-and-defs.td b/mlir/test/mlir-tblgen/op-decl-and-defs.td index aad7ea4437e78..077aa750352e0 100644 --- a/mlir/test/mlir-tblgen/op-decl-and-defs.td +++ b/mlir/test/mlir-tblgen/op-decl-and-defs.td @@ -141,7 +141,7 @@ def NS_AttrSizedOperandOp : NS_Op<"attr_sized_operands", Variadic:$b, I32:$c, Variadic:$d, - I32ElementsAttr:$operand_segment_sizes + I32ElementsAttr:$operandSegmentSizes ); } diff --git a/mlir/test/mlir-tblgen/op-python-bindings.td b/mlir/test/mlir-tblgen/op-python-bindings.td index de979f7e8f43e..a131209fa45cb 100644 --- a/mlir/test/mlir-tblgen/op-python-bindings.td +++ b/mlir/test/mlir-tblgen/op-python-bindings.td @@ -39,7 +39,7 @@ def AttrSizedOperandsOp : TestOp<"attr_sized_operands", // CHECK: def variadic1(self): // CHECK: operand_range = _ods_segmented_accessor( // CHECK: self.operation.operands, - // CHECK: self.operation.attributes["operand_segment_sizes"], 0) + // CHECK: self.operation.attributes["operandSegmentSizes"], 0) // 
CHECK: return operand_range // CHECK-NOT: if len(operand_range) // @@ -47,14 +47,14 @@ def AttrSizedOperandsOp : TestOp<"attr_sized_operands", // CHECK: def non_variadic(self): // CHECK: operand_range = _ods_segmented_accessor( // CHECK: self.operation.operands, - // CHECK: self.operation.attributes["operand_segment_sizes"], 1) + // CHECK: self.operation.attributes["operandSegmentSizes"], 1) // CHECK: return operand_range[0] // // CHECK: @builtins.property // CHECK: def variadic2(self): // CHECK: operand_range = _ods_segmented_accessor( // CHECK: self.operation.operands, - // CHECK: self.operation.attributes["operand_segment_sizes"], 2) + // CHECK: self.operation.attributes["operandSegmentSizes"], 2) // CHECK: return operand_range[0] if len(operand_range) > 0 else None let arguments = (ins Variadic:$variadic1, AnyType:$non_variadic, Optional:$variadic2); @@ -83,21 +83,21 @@ def AttrSizedResultsOp : TestOp<"attr_sized_results", // CHECK: def variadic1(self): // CHECK: result_range = _ods_segmented_accessor( // CHECK: self.operation.results, - // CHECK: self.operation.attributes["result_segment_sizes"], 0) + // CHECK: self.operation.attributes["resultSegmentSizes"], 0) // CHECK: return result_range[0] if len(result_range) > 0 else None // // CHECK: @builtins.property // CHECK: def non_variadic(self): // CHECK: result_range = _ods_segmented_accessor( // CHECK: self.operation.results, - // CHECK: self.operation.attributes["result_segment_sizes"], 1) + // CHECK: self.operation.attributes["resultSegmentSizes"], 1) // CHECK: return result_range[0] // // CHECK: @builtins.property // CHECK: def variadic2(self): // CHECK: result_range = _ods_segmented_accessor( // CHECK: self.operation.results, - // CHECK: self.operation.attributes["result_segment_sizes"], 2) + // CHECK: self.operation.attributes["resultSegmentSizes"], 2) // CHECK: return result_range // CHECK-NOT: if len(result_range) let results = (outs Optional:$variadic1, AnyType:$non_variadic, diff --git 
a/mlir/test/python/dialects/linalg/ops.py b/mlir/test/python/dialects/linalg/ops.py index 88f48d0d544e7..b728e00837814 100644 --- a/mlir/test/python/dialects/linalg/ops.py +++ b/mlir/test/python/dialects/linalg/ops.py @@ -100,7 +100,7 @@ def named_form(lhs, rhs): init_result = tensor.EmptyOp([4, 8], f32) # CHECK: "linalg.matmul"(%{{.*}}) # CHECK-SAME: cast = #linalg.type_fn - # CHECK-SAME: odsOperandSegmentSizes = array + # CHECK-SAME: operandSegmentSizes = array # CHECK-NEXT: ^bb0(%{{.*}}: f32, %{{.*}}: f32, %{{.*}}: f32): # CHECK-NEXT: arith.mulf{{.*}} (f32, f32) -> f32 # CHECK-NEXT: arith.addf{{.*}} (f32, f32) -> f32 diff --git a/mlir/test/python/dialects/ods_helpers.py b/mlir/test/python/dialects/ods_helpers.py index 71879bdcb51f5..0d2a18e0eb0af 100644 --- a/mlir/test/python/dialects/ods_helpers.py +++ b/mlir/test/python/dialects/ods_helpers.py @@ -96,8 +96,8 @@ class TestOp(OpView): # CHECK: %[[V0:.+]] = "custom.value" # CHECK: %[[V1:.+]] = "custom.value" # CHECK: "custom.test_op"(%[[V0]], %[[V1]]) - # CHECK-NOT: operand_segment_sizes - # CHECK-NOT: result_segment_sizes + # CHECK-NOT: operandSegmentSizes + # CHECK-NOT: resultSegmentSizes # CHECK-SAME: : (i32, i32) -> (i8, i16) print(m) @@ -128,8 +128,8 @@ class TestOp(OpView): # CHECK: %[[V2:.+]] = "custom.value" # CHECK: %[[V3:.+]] = "custom.value" # CHECK: "custom.test_op"(%[[V0]], %[[V1]], %[[V2]], %[[V3]]) - # CHECK-SAME: operand_segment_sizes = array - # CHECK-SAME: result_segment_sizes = array + # CHECK-SAME: operandSegmentSizes = array + # CHECK-SAME: resultSegmentSizes = array # CHECK-SAME: : (i32, i32, i32, i32) -> (i8, i16, i32, i64) op = TestOp.build_generic( results=[[t0, t1], t2, t3], operands=[v0, [v1, v2], v3] @@ -137,8 +137,8 @@ class TestOp(OpView): # Now test with optional omitted. 
# CHECK: "custom.test_op"(%[[V0]]) - # CHECK-SAME: operand_segment_sizes = array - # CHECK-SAME: result_segment_sizes = array + # CHECK-SAME: operandSegmentSizes = array + # CHECK-SAME: resultSegmentSizes = array # CHECK-SAME: (i32) -> i64 op = TestOp.build_generic( results=[None, None, t3], operands=[v0, None, None] diff --git a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp index 943e323c6af40..f6e43d42d29f0 100644 --- a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp +++ b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp @@ -763,7 +763,7 @@ static const char *const dialectDynamicTypeParserDispatch = R"( { auto parseResult = parseOptionalDynamicType(mnemonic, parser, genType); if (parseResult.has_value()) { - if (::mlir::succeeded(parseResult.getValue())) + if (::mlir::succeeded(parseResult.value())) return genType; return ::mlir::Type(); } diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp index 0ff72ec75f1d4..a3b9c71048422 100644 --- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp +++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp @@ -56,8 +56,8 @@ static const char *const propertyDiag = "propDiag"; /// The names of the implicit attributes that contain variadic operand and /// result segment sizes. -static const char *const operandSegmentAttrName = "operand_segment_sizes"; -static const char *const resultSegmentAttrName = "result_segment_sizes"; +static const char *const operandSegmentAttrName = "operandSegmentSizes"; +static const char *const resultSegmentAttrName = "resultSegmentSizes"; /// Code for an Op to lookup an attribute. Uses cached identifiers and subrange /// lookup. @@ -158,24 +158,24 @@ static const char *const valueRangeReturnCode = R"( /// Read operand/result segment_size from bytecode. 
static const char *const readBytecodeSegmentSize = R"( if ($_reader.getBytecodeVersion() < /*kNativePropertiesODSSegmentSize=*/6) { - DenseI32ArrayAttr attr; - if (::mlir::failed($_reader.readAttribute(attr))) return failure(); + ::mlir::DenseI32ArrayAttr attr; + if (::mlir::failed($_reader.readAttribute(attr))) return ::mlir::failure(); if (attr.size() > static_cast(sizeof($_storage) / sizeof(int32_t))) { $_reader.emitError("size mismatch for operand/result_segment_size"); - return failure(); + return ::mlir::failure(); } - llvm::copy(ArrayRef(attr), $_storage.begin()); + llvm::copy(::llvm::ArrayRef(attr), $_storage.begin()); } else { - return $_reader.readSparseArray(MutableArrayRef($_storage)); + return $_reader.readSparseArray(::llvm::MutableArrayRef($_storage)); } )"; /// Write operand/result segment_size to bytecode. static const char *const writeBytecodeSegmentSize = R"( if ($_writer.getBytecodeVersion() < /*kNativePropertiesODSSegmentSize=*/6) - $_writer.writeAttribute(DenseI32ArrayAttr::get(getContext(), $_storage)); + $_writer.writeAttribute(::mlir::DenseI32ArrayAttr::get(getContext(), $_storage)); else - $_writer.writeSparseArray(ArrayRef($_storage)); + $_writer.writeSparseArray(::llvm::ArrayRef($_storage)); )"; /// A header for indicating code sections. 
@@ -430,15 +430,15 @@ void OpOrAdaptorHelper::computeAttrMetadata() { /*interfaceType=*/"::llvm::ArrayRef", /*convertFromStorageCall=*/"$_storage", /*assignToStorageCall=*/ - "llvm::copy($_value, $_storage.begin())", + "::llvm::copy($_value, $_storage.begin())", /*convertToAttributeCall=*/ - "DenseI32ArrayAttr::get($_ctxt, $_storage)", + "::mlir::DenseI32ArrayAttr::get($_ctxt, $_storage)", /*convertFromAttributeCall=*/ "return convertFromAttribute($_storage, $_attr, $_diag);", /*readFromMlirBytecodeCall=*/readBytecodeSegmentSize, /*writeToMlirBytecodeCall=*/writeBytecodeSegmentSize, /*hashPropertyCall=*/ - "llvm::hash_combine_range(std::begin($_storage), " + "::llvm::hash_combine_range(std::begin($_storage), " "std::end($_storage));", /*StringRef defaultValue=*/""); }; @@ -447,7 +447,7 @@ void OpOrAdaptorHelper::computeAttrMetadata() { if (op.getDialect().usePropertiesForAttributes()) { operandSegmentsSizeStorage = llvm::formatv("std::array", op.getNumOperands()); - operandSegmentsSize = {"odsOperandSegmentSizes", + operandSegmentsSize = {"operandSegmentSizes", makeProperty(operandSegmentsSizeStorage)}; } else { attrMetadata.insert( @@ -460,7 +460,7 @@ void OpOrAdaptorHelper::computeAttrMetadata() { if (op.getDialect().usePropertiesForAttributes()) { resultSegmentsSizeStorage = llvm::formatv("std::array", op.getNumResults()); - resultSegmentsSize = {"odsResultSegmentSizes", + resultSegmentsSize = {"resultSegmentSizes", makeProperty(resultSegmentsSizeStorage)}; } else { attrMetadata.insert( @@ -868,10 +868,12 @@ while (true) {{ if (useProperties) { for (const std::pair &it : emitHelper.getAttrMetadata()) { + const AttributeMetadata &metadata = it.second; + if (metadata.constraint && metadata.constraint->isDerivedAttr()) + continue; body << formatv( "auto tblgen_{0} = getProperties().{0}; (void)tblgen_{0};\n", it.first); - const AttributeMetadata &metadata = it.second; if (metadata.isRequired) body << formatv( "if (!tblgen_{0}) return {1}\"requires attribute 
'{0}'\");\n", @@ -1306,10 +1308,12 @@ void OpEmitter::genPropertiesSupport() { std::string getAttr; llvm::raw_string_ostream os(getAttr); os << " auto attr = dict.get(\"" << name << "\");"; - if (name == "odsOperandSegmentSizes") { + if (name == operandSegmentAttrName) { + // Backward compat for now, TODO: Remove at some point. os << " if (!attr) attr = dict.get(\"operand_segment_sizes\");"; } - if (name == "odsResultSegmentSizes") { + if (name == resultSegmentAttrName) { + // Backward compat for now, TODO: Remove at some point. os << " if (!attr) attr = dict.get(\"result_segment_sizes\");"; } os.flush(); @@ -1327,10 +1331,12 @@ void OpEmitter::genPropertiesSupport() { std::string getAttr; llvm::raw_string_ostream os(getAttr); os << " auto attr = dict.get(\"" << name << "\");"; - if (name == "odsOperandSegmentSizes") { + if (name == operandSegmentAttrName) { + // Backward compat for now os << " if (!attr) attr = dict.get(\"operand_segment_sizes\");"; } - if (name == "odsResultSegmentSizes") { + if (name == resultSegmentAttrName) { + // Backward compat for now os << " if (!attr) attr = dict.get(\"result_segment_sizes\");"; } os.flush(); @@ -1445,7 +1451,7 @@ void OpEmitter::genPropertiesSupport() { )decl"; const char *setInherentAttrMethodFmt = R"decl( if (name == "{0}") {{ - prop.{0} = dyn_cast_or_null>(value); + prop.{0} = ::llvm::dyn_cast_or_null>(value); return; } )decl"; @@ -1466,39 +1472,39 @@ void OpEmitter::genPropertiesSupport() { // even though it is a native property. 
const auto *namedProperty = cast(attrOrProp); StringRef name = namedProperty->name; - if (name != "odsOperandSegmentSizes" && name != "odsResultSegmentSizes") + if (name != operandSegmentAttrName && name != resultSegmentAttrName) continue; auto &prop = namedProperty->prop; FmtContext fctx; fctx.addSubst("_ctxt", "ctx"); fctx.addSubst("_storage", Twine("prop.") + name); - if (name == "odsOperandSegmentSizes") { + if (name == operandSegmentAttrName) { getInherentAttrMethod - << formatv(" if (name == \"odsOperandSegmentSizes\" || name == " + << formatv(" if (name == \"operand_segment_sizes\" || name == " "\"{0}\") return ", operandSegmentAttrName); } else { getInherentAttrMethod - << formatv(" if (name == \"odsResultSegmentSizes\" || name == " + << formatv(" if (name == \"result_segment_sizes\" || name == " "\"{0}\") return ", resultSegmentAttrName); } getInherentAttrMethod << tgfmt(prop.getConvertToAttributeCall(), &fctx) << ";\n"; - if (name == "odsOperandSegmentSizes") { - setInherentAttrMethod << formatv( - " if (name == \"odsOperandSegmentSizes\" || name == " - "\"{0}\") {{", - operandSegmentAttrName); + if (name == operandSegmentAttrName) { + setInherentAttrMethod + << formatv(" if (name == \"operand_segment_sizes\" || name == " + "\"{0}\") {{", + operandSegmentAttrName); } else { setInherentAttrMethod - << formatv(" if (name == \"odsResultSegmentSizes\" || name == " + << formatv(" if (name == \"result_segment_sizes\" || name == " "\"{0}\") {{", resultSegmentAttrName); } setInherentAttrMethod << formatv(R"decl( - auto arrAttr = dyn_cast_or_null(value); + auto arrAttr = ::llvm::dyn_cast_or_null<::mlir::DenseI32ArrayAttr>(value); if (!arrAttr) return; if (arrAttr.size() != sizeof(prop.{0}) / sizeof(int32_t)) return; @@ -1507,7 +1513,7 @@ void OpEmitter::genPropertiesSupport() { } )decl", name); - if (name == "odsOperandSegmentSizes") { + if (name == operandSegmentAttrName) { populateInherentAttrsMethod << formatv(" attrs.append(\"{0}\", {1});\n", 
operandSegmentAttrName, tgfmt(prop.getConvertToAttributeCall(), &fctx)); @@ -2015,7 +2021,7 @@ void OpEmitter::genNamedOperandGetters() { if (op.getTrait("::mlir::OpTrait::AttrSizedOperandSegments")) { if (op.getDialect().usePropertiesForAttributes()) attrSizeInitCode = formatv(adapterSegmentSizeAttrInitCodeProperties, - "getProperties().odsOperandSegmentSizes"); + "getProperties().operandSegmentSizes"); else attrSizeInitCode = formatv(opSegmentSizeAttrInitCode, @@ -2056,8 +2062,8 @@ void OpEmitter::genNamedOperandSetters() { if (emitHelper.hasProperties()) body << formatv(", ::mlir::MutableOperandRange::OperandSegment({0}u, " "{{getOperandSegmentSizesAttrName(), " - "DenseI32ArrayAttr::get(getContext(), " - "getProperties().odsOperandSegmentSizes)})", + "::mlir::DenseI32ArrayAttr::get(getContext(), " + "getProperties().operandSegmentSizes)})", i); else body << formatv( @@ -2116,7 +2122,7 @@ void OpEmitter::genNamedResultGetters() { if (attrSizedResults) { if (op.getDialect().usePropertiesForAttributes()) attrSizeInitCode = formatv(adapterSegmentSizeAttrInitCodeProperties, - "getProperties().odsResultSegmentSizes"); + "getProperties().resultSegmentSizes"); else attrSizeInitCode = formatv(opSegmentSizeAttrInitCode, @@ -2291,11 +2297,11 @@ void OpEmitter::genSeparateArgParamBuilder() { << ");\n"; } - // Automatically create the 'result_segment_sizes' attribute using + // Automatically create the 'resultSegmentSizes' attribute using // the length of the type ranges. 
if (op.getTrait("::mlir::OpTrait::AttrSizedResultSegments")) { if (op.getDialect().usePropertiesForAttributes()) { - body << " llvm::copy(ArrayRef({"; + body << " ::llvm::copy(::llvm::ArrayRef({"; } else { std::string getterName = op.getGetterName(resultSegmentAttrName); body << " " << builderOpState << ".addAttribute(" << getterName @@ -2321,7 +2327,7 @@ void OpEmitter::genSeparateArgParamBuilder() { if (op.getDialect().usePropertiesForAttributes()) { body << "}), " << builderOpState << ".getOrAddProperties()." - "odsResultSegmentSizes.begin());\n"; + "resultSegmentSizes.begin());\n"; } else { body << "}));\n"; } @@ -2943,11 +2949,11 @@ void OpEmitter::genCodeForAddingArgAndRegionForBuilder( if (op.getTrait("::mlir::OpTrait::AttrSizedOperandSegments")) { std::string sizes = op.getGetterName(operandSegmentAttrName); if (op.getDialect().usePropertiesForAttributes()) { - body << " llvm::copy(ArrayRef({"; + body << " ::llvm::copy(::llvm::ArrayRef({"; emitSegment(); body << "}), " << builderOpState << ".getOrAddProperties()." - "odsOperandSegmentSizes.begin());\n"; + "operandSegmentSizes.begin());\n"; } else { body << " " << builderOpState << ".addAttribute(" << sizes << "AttrName(" << builderOpState << ".name), " @@ -3819,8 +3825,7 @@ OpOperandAdaptorEmitter::OpOperandAdaptorEmitter( if (attr) { storageType = attr->getStorageType(); } else { - if (name != "odsOperandSegmentSizes" && - name != "odsResultSegmentSizes") { + if (name != operandSegmentAttrName && name != resultSegmentAttrName) { report_fatal_error("unexpected AttributeMetadata"); } // TODO: update to use native integers. 
@@ -3935,7 +3940,7 @@ OpOperandAdaptorEmitter::OpOperandAdaptorEmitter( if (op.getDialect().usePropertiesForAttributes()) sizeAttrInit = formatv(adapterSegmentSizeAttrInitCodeProperties, - llvm::formatv("getProperties().odsOperandSegmentSizes")); + llvm::formatv("getProperties().operandSegmentSizes")); else sizeAttrInit = formatv(adapterSegmentSizeAttrInitCode, emitHelper.getAttr(operandSegmentAttrName)); diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp index c38f873ddaba4..546d4616f7173 100644 --- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp +++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp @@ -1662,14 +1662,14 @@ void OperationFormat::genParserVariadicSegmentResolution(Operator &op, body << "1"; }; if (op.getDialect().usePropertiesForAttributes()) { - body << "llvm::copy(ArrayRef({"; + body << "::llvm::copy(::llvm::ArrayRef({"; llvm::interleaveComma(op.getOperands(), body, interleaveFn); body << formatv("}), " "result.getOrAddProperties<{0}::Properties>()." - "odsOperandSegmentSizes.begin());\n", + "operandSegmentSizes.begin());\n", op.getCppClassName()); } else { - body << " result.addAttribute(\"operand_segment_sizes\", " + body << " result.addAttribute(\"operandSegmentSizes\", " << "parser.getBuilder().getDenseI32ArrayAttr({"; llvm::interleaveComma(op.getOperands(), body, interleaveFn); body << "}));\n"; @@ -1710,10 +1710,10 @@ void OperationFormat::genParserVariadicSegmentResolution(Operator &op, llvm::interleaveComma(op.getResults(), body, interleaveFn); body << formatv("}), " "result.getOrAddProperties<{0}::Properties>()." 
- "odsResultSegmentSizes.begin());\n", + "resultSegmentSizes.begin());\n", op.getCppClassName()); } else { - body << " result.addAttribute(\"result_segment_sizes\", " + body << " result.addAttribute(\"resultSegmentSizes\", " << "parser.getBuilder().getDenseI32ArrayAttr({"; llvm::interleaveComma(op.getResults(), body, interleaveFn); body << "}));\n"; @@ -1767,10 +1767,10 @@ static void genAttrDictPrinter(OperationFormat &fmt, Operator &op, // Elide the variadic segment size attributes if necessary. if (!fmt.allOperands && op.getTrait("::mlir::OpTrait::AttrSizedOperandSegments")) - body << " elidedAttrs.push_back(\"operand_segment_sizes\");\n"; + body << " elidedAttrs.push_back(\"operandSegmentSizes\");\n"; if (!fmt.allResultTypes && op.getTrait("::mlir::OpTrait::AttrSizedResultSegments")) - body << " elidedAttrs.push_back(\"result_segment_sizes\");\n"; + body << " elidedAttrs.push_back(\"resultSegmentSizes\");\n"; for (const StringRef key : fmt.inferredAttributes.keys()) body << " elidedAttrs.push_back(\"" << key << "\");\n"; for (const NamedAttribute *attr : fmt.usedAttributes) diff --git a/mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp b/mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp index dd6e52d300efe..7c7b991fb7b07 100644 --- a/mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp +++ b/mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp @@ -170,7 +170,7 @@ constexpr const char *opVariadicSegmentTemplate = R"Py( def {0}(self): {1}_range = _ods_segmented_accessor( self.operation.{1}s, - self.operation.attributes["{1}_segment_sizes"], {2}) + self.operation.attributes["{1}SegmentSizes"], {2}) return {1}_range{3} )Py"; diff --git a/mlir/unittests/IR/AdaptorTest.cpp b/mlir/unittests/IR/AdaptorTest.cpp index ec15d30875755..4a01d2c52b645 100644 --- a/mlir/unittests/IR/AdaptorTest.cpp +++ b/mlir/unittests/IR/AdaptorTest.cpp @@ -39,7 +39,7 @@ TEST(Adaptor, GenericAdaptorsOperandAccess) { // value from the value 0. 
SmallVector> v = {0, 4}; OIListSimple::Properties prop; - prop.odsOperandSegmentSizes = {1, 0, 1}; + prop.operandSegmentSizes = {1, 0, 1}; OIListSimple::GenericAdaptor>> d(v, {}, prop, {}); EXPECT_EQ(d.getArg0(), 0); diff --git a/mlir/unittests/IR/DialectTest.cpp b/mlir/unittests/IR/DialectTest.cpp index a2b58bf731976..e99d46e6d2643 100644 --- a/mlir/unittests/IR/DialectTest.cpp +++ b/mlir/unittests/IR/DialectTest.cpp @@ -136,4 +136,50 @@ TEST(Dialect, RepeatedDelayedRegistration) { EXPECT_TRUE(testDialectInterface != nullptr); } +namespace { +/// A dummy extension that increases a counter when being applied and +/// recursively adds additional extensions. +struct DummyExtension : DialectExtension { + DummyExtension(int *counter, int numRecursive) + : DialectExtension(), counter(counter), numRecursive(numRecursive) {} + + void apply(MLIRContext *ctx, TestDialect *dialect) const final { + ++(*counter); + DialectRegistry nestedRegistry; + for (int i = 0; i < numRecursive; ++i) + nestedRegistry.addExtension( + std::make_unique(counter, /*numRecursive=*/0)); + // Adding additional extensions may trigger a reallocation of the + // `extensions` vector in the dialect registry. + ctx->appendDialectRegistry(nestedRegistry); + } + +private: + int *counter; + int numRecursive; +}; +} // namespace + +TEST(Dialect, NestedDialectExtension) { + DialectRegistry registry; + registry.insert(); + + // Add an extension that adds 100 more extensions. + int counter1 = 0; + registry.addExtension(std::make_unique(&counter1, 100)); + // Add one more extension. This should not crash. + int counter2 = 0; + registry.addExtension(std::make_unique(&counter2, 0)); + + // Load dialect and apply extensions. + MLIRContext context(registry); + Dialect *testDialect = context.getOrLoadDialect(); + ASSERT_TRUE(testDialect != nullptr); + + // Extensions may be applied multiple times. Make sure that each expected + // extension was applied at least once. 
+ EXPECT_GE(counter1, 101); + EXPECT_GE(counter2, 1); +} + } // namespace diff --git a/mlir/unittests/Pass/PassManagerTest.cpp b/mlir/unittests/Pass/PassManagerTest.cpp index 97349d681c3a0..70a679125c0ea 100644 --- a/mlir/unittests/Pass/PassManagerTest.cpp +++ b/mlir/unittests/Pass/PassManagerTest.cpp @@ -10,6 +10,7 @@ #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/Diagnostics.h" #include "mlir/Pass/Pass.h" #include "gtest/gtest.h" @@ -144,4 +145,39 @@ TEST(PassManagerTest, InvalidPass) { "intend to nest?"); } +/// Simple pass to annotate a func::FuncOp with the results of analysis. +struct InitializeCheckingPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(InitializeCheckingPass) + LogicalResult initialize(MLIRContext *ctx) final { + initialized = true; + return success(); + } + bool initialized = false; + + void runOnOperation() override { + if (!initialized) { + getOperation()->emitError() << "Pass isn't initialized!"; + signalPassFailure(); + } + } +}; + +TEST(PassManagerTest, PassInitialization) { + MLIRContext context; + context.allowUnregisteredDialects(); + + // Create a module + OwningOpRef module(ModuleOp::create(UnknownLoc::get(&context))); + + // Instantiate and run our pass. + auto pm = PassManager::on(&context); + pm.addPass(std::make_unique()); + EXPECT_TRUE(succeeded(pm.run(module.get()))); + + // Adding a second copy of the pass, we should also initialize it! 
+ pm.addPass(std::make_unique()); + EXPECT_TRUE(succeeded(pm.run(module.get()))); +} + } // namespace diff --git a/openmp/libomptarget/src/OmptCallback.cpp b/openmp/libomptarget/src/OmptCallback.cpp index cd44d0903be9c..4882a762adbf6 100644 --- a/openmp/libomptarget/src/OmptCallback.cpp +++ b/openmp/libomptarget/src/OmptCallback.cpp @@ -71,7 +71,8 @@ static uint64_t createRegionId() { } void Interface::beginTargetDataAlloc(int64_t DeviceId, void *HstPtrBegin, - size_t Size, void *Code) { + void **TgtPtrBegin, size_t Size, + void *Code) { beginTargetDataOperation(); if (ompt_callback_target_data_op_emi_fn) { // HostOpId will be set by the tool. Invoke the tool supplied data op EMI @@ -79,7 +80,7 @@ void Interface::beginTargetDataAlloc(int64_t DeviceId, void *HstPtrBegin, ompt_callback_target_data_op_emi_fn( ompt_scope_begin, TargetTaskData, &TargetData, &TargetRegionOpId, ompt_target_data_alloc, HstPtrBegin, - /* SrcDeviceNum */ omp_get_initial_device(), /* TgtPtrBegin */ nullptr, + /* SrcDeviceNum */ omp_get_initial_device(), *TgtPtrBegin, /* TgtDeviceNum */ DeviceId, Size, Code); } else if (ompt_callback_target_data_op_fn) { // HostOpId is set by the runtime @@ -87,13 +88,14 @@ void Interface::beginTargetDataAlloc(int64_t DeviceId, void *HstPtrBegin, // Invoke the tool supplied data op callback ompt_callback_target_data_op_fn( TargetData.value, HostOpId, ompt_target_data_alloc, HstPtrBegin, - /* SrcDeviceNum */ omp_get_initial_device(), /* TgtPtrBegin */ nullptr, + /* SrcDeviceNum */ omp_get_initial_device(), *TgtPtrBegin, /* TgtDeviceNum */ DeviceId, Size, Code); } } void Interface::endTargetDataAlloc(int64_t DeviceId, void *HstPtrBegin, - size_t Size, void *Code) { + void **TgtPtrBegin, size_t Size, + void *Code) { // Only EMI callback handles end scope if (ompt_callback_target_data_op_emi_fn) { // HostOpId will be set by the tool. 
Invoke the tool supplied data op EMI @@ -101,7 +103,7 @@ void Interface::endTargetDataAlloc(int64_t DeviceId, void *HstPtrBegin, ompt_callback_target_data_op_emi_fn( ompt_scope_end, TargetTaskData, &TargetData, &TargetRegionOpId, ompt_target_data_alloc, HstPtrBegin, - /* SrcDeviceNum */ omp_get_initial_device(), /* TgtPtrBegin */ nullptr, + /* SrcDeviceNum */ omp_get_initial_device(), *TgtPtrBegin, /* TgtDeviceNum */ DeviceId, Size, Code); } endTargetDataOperation(); diff --git a/openmp/libomptarget/src/OmptInterface.h b/openmp/libomptarget/src/OmptInterface.h index c3a52969bf80e..178cedacf4a58 100644 --- a/openmp/libomptarget/src/OmptInterface.h +++ b/openmp/libomptarget/src/OmptInterface.h @@ -47,12 +47,12 @@ static ompt_get_target_task_data_t ompt_get_target_task_data_fn; class Interface { public: /// Top-level function for invoking callback before device data allocation - void beginTargetDataAlloc(int64_t DeviceId, void *TgtPtrBegin, size_t Size, - void *Code); + void beginTargetDataAlloc(int64_t DeviceId, void *HstPtrBegin, + void **TgtPtrBegin, size_t Size, void *Code); /// Top-level function for invoking callback after device data allocation - void endTargetDataAlloc(int64_t DeviceId, void *TgtPtrBegin, size_t Size, - void *Code); + void endTargetDataAlloc(int64_t DeviceId, void *HstPtrBegin, + void **TgtPtrBegin, size_t Size, void *Code); /// Top-level function for invoking callback before data submit void beginTargetDataSubmit(int64_t DeviceId, void *HstPtrBegin, diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp index 276b7c9f499c5..1421408435c2c 100644 --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -561,12 +561,14 @@ __tgt_target_table *DeviceTy::loadBinary(void *Img) { void *DeviceTy::allocData(int64_t Size, void *HstPtr, int32_t Kind) { /// RAII to establish tool anchors before and after data allocation + void *TargetPtr = nullptr; OMPT_IF_BUILT(InterfaceRAII TargetDataAllocRAII( 
RegionInterface.getCallbacks(), - RTLDeviceID, HstPtr, Size, + RTLDeviceID, HstPtr, &TargetPtr, Size, /* CodePtr */ OMPT_GET_RETURN_ADDRESS(0));) - return RTL->data_alloc(RTLDeviceID, Size, HstPtr, Kind); + TargetPtr = RTL->data_alloc(RTLDeviceID, Size, HstPtr, Kind); + return TargetPtr; } int32_t DeviceTy::deleteData(void *TgtAllocBegin, int32_t Kind) { diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp index 7fb72e16088ce..d47f0a3458587 100644 --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -108,6 +108,21 @@ targetDataMapper(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, TargetAsyncInfoTy TargetAsyncInfo(Device); AsyncInfoTy &AsyncInfo = TargetAsyncInfo; + /// RAII to establish tool anchors before and after data begin / end / update + OMPT_IF_BUILT(assert((TargetDataFunction == targetDataBegin || + TargetDataFunction == targetDataEnd || + TargetDataFunction == targetDataUpdate) && + "Encountered unexpected TargetDataFunction during " + "execution of targetDataMapper"); + auto CallbackFunctions = + (TargetDataFunction == targetDataBegin) + ? RegionInterface.getCallbacks() + : (TargetDataFunction == targetDataEnd) + ? 
RegionInterface.getCallbacks() + : RegionInterface.getCallbacks(); + InterfaceRAII TargetDataRAII(CallbackFunctions, DeviceId, + OMPT_GET_RETURN_ADDRESS(0));) + int Rc = OFFLOAD_SUCCESS; Rc = TargetDataFunction(Loc, Device, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, ArgMappers, AsyncInfo, @@ -129,12 +144,6 @@ EXTERN void __tgt_target_data_begin_mapper(ident_t *Loc, int64_t DeviceId, map_var_info_t *ArgNames, void **ArgMappers) { TIMESCOPE_WITH_IDENT(Loc); - /// RAII to establish tool anchors before and after data begin - OMPT_IF_BUILT(InterfaceRAII TargetDataEnterRAII( - RegionInterface.getCallbacks(), - DeviceId, - /* CodePtr */ OMPT_GET_RETURN_ADDRESS(0));) - targetDataMapper(Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, ArgMappers, targetDataBegin, "Entering OpenMP data region", "begin"); @@ -161,12 +170,6 @@ EXTERN void __tgt_target_data_end_mapper(ident_t *Loc, int64_t DeviceId, map_var_info_t *ArgNames, void **ArgMappers) { TIMESCOPE_WITH_IDENT(Loc); - /// RAII to establish tool anchors before and after data end - OMPT_IF_BUILT(InterfaceRAII TargetDataExitRAII( - RegionInterface.getCallbacks(), - DeviceId, - /* CodePtr */ OMPT_GET_RETURN_ADDRESS(0));) - targetDataMapper(Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, ArgMappers, targetDataEnd, "Exiting OpenMP data region", "end"); @@ -190,12 +193,6 @@ EXTERN void __tgt_target_data_update_mapper(ident_t *Loc, int64_t DeviceId, map_var_info_t *ArgNames, void **ArgMappers) { TIMESCOPE_WITH_IDENT(Loc); - /// RAII to establish tool anchors before and after data update - OMPT_IF_BUILT(InterfaceRAII TargetDataUpdateRAII( - RegionInterface.getCallbacks(), - DeviceId, - /* CodePtr */ OMPT_GET_RETURN_ADDRESS(0));) - targetDataMapper( Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, ArgMappers, targetDataUpdate, "Updating OpenMP data", "update"); @@ -295,7 +292,8 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, 
DeviceTy &Device = *PM->Devices[DeviceId]; TargetAsyncInfoTy TargetAsyncInfo(Device); AsyncInfoTy &AsyncInfo = TargetAsyncInfo; - OMPT_IF_BUILT(InterfaceRAII TargetDataAllocRAII( + /// RAII to establish tool anchors before and after target region + OMPT_IF_BUILT(InterfaceRAII TargetRAII( RegionInterface.getCallbacks(), DeviceId, /* CodePtr */ OMPT_GET_RETURN_ADDRESS(0));) @@ -363,7 +361,8 @@ EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId, return OMP_TGT_FAIL; } DeviceTy &Device = *PM->Devices[DeviceId]; - OMPT_IF_BUILT(InterfaceRAII TargetDataAllocRAII( + /// RAII to establish tool anchors before and after target region + OMPT_IF_BUILT(InterfaceRAII TargetRAII( RegionInterface.getCallbacks(), DeviceId, /* CodePtr */ OMPT_GET_RETURN_ADDRESS(0));) diff --git a/openmp/libomptarget/test/ompt/veccopy_data.c b/openmp/libomptarget/test/ompt/veccopy_data.c new file mode 100644 index 0000000000000..5bbc47dc11a7d --- /dev/null +++ b/openmp/libomptarget/test/ompt/veccopy_data.c @@ -0,0 +1,128 @@ +// RUN: %libomptarget-compile-run-and-check-generic +// REQUIRES: ompt +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-oldDriver +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO + +/* + * Example OpenMP program that registers EMI callbacks. + * Explicitly testing for an initialized device num and + * #pragma omp target [data enter / data exit / update] + * The latter with the addition of a nowait clause. 
+ */ + +#include +#include + +#include "callbacks.h" +#include "register_emi.h" + +#define N 100000 + +#pragma omp declare target +int c[N]; +#pragma omp end declare target + +int main() { + int a[N]; + int b[N]; + + int i; + + for (i = 0; i < N; i++) + a[i] = 0; + + for (i = 0; i < N; i++) + b[i] = i; + + for (i = 0; i < N; i++) + c[i] = 0; + +#pragma omp target enter data map(to : a) +#pragma omp target parallel for + { + for (int j = 0; j < N; j++) + a[j] = b[j]; + } +#pragma omp target exit data map(from : a) + +#pragma omp target parallel for map(alloc : c) + { + for (int j = 0; j < N; j++) + c[j] = 2 * j + 1; + } +#pragma omp target update from(c) nowait +#pragma omp barrier + + int rc = 0; + for (i = 0; i < N; i++) { + if (a[i] != i) { + rc++; + printf("Wrong value: a[%d]=%d\n", i, a[i]); + } + } + + for (i = 0; i < N; i++) { + if (c[i] != 2 * i + 1) { + rc++; + printf("Wrong value: c[%d]=%d\n", i, c[i]); + } + } + + if (!rc) + printf("Success\n"); + + return rc; +} + +/// CHECK-NOT: Callback Target EMI: +/// CHECK-NOT: device_num=-1 +/// CHECK: Callback Init: +/// CHECK: Callback Load: +/// CHECK: Callback Target EMI: kind=2 endpoint=1 +/// CHECK-NOT: device_num=-1 +/// CHECK: Callback DataOp EMI: endpoint=1 optype=1 +/// CHECK: Callback DataOp EMI: endpoint=2 optype=1 +/// CHECK-NOT: dest=(nil) +/// CHECK: Callback DataOp EMI: endpoint=1 optype=2 +/// CHECK: Callback DataOp EMI: endpoint=2 optype=2 +/// CHECK: Callback Target EMI: kind=2 endpoint=2 +/// CHECK-NOT: device_num=-1 +/// CHECK: Callback Target EMI: kind=1 endpoint=1 +/// CHECK-NOT: device_num=-1 +/// CHECK: Callback DataOp EMI: endpoint=1 optype=1 +/// CHECK: Callback DataOp EMI: endpoint=2 optype=1 +/// CHECK-NOT: dest=(nil) +/// CHECK: Callback DataOp EMI: endpoint=1 optype=2 +/// CHECK: Callback DataOp EMI: endpoint=2 optype=2 +/// CHECK: Callback Submit EMI: endpoint=1 req_num_teams=1 +/// CHECK: Callback Submit EMI: endpoint=2 req_num_teams=1 +/// CHECK: Callback DataOp EMI: endpoint=1 
optype=3 +/// CHECK: Callback DataOp EMI: endpoint=2 optype=3 +/// CHECK: Callback DataOp EMI: endpoint=1 optype=4 +/// CHECK: Callback DataOp EMI: endpoint=2 optype=4 +/// CHECK: Callback Target EMI: kind=1 endpoint=2 +/// CHECK-NOT: device_num=-1 +/// CHECK: Callback Target EMI: kind=3 endpoint=1 +/// CHECK-NOT: device_num=-1 +/// CHECK: Callback DataOp EMI: endpoint=1 optype=3 +/// CHECK: Callback DataOp EMI: endpoint=2 optype=3 +/// CHECK: Callback DataOp EMI: endpoint=1 optype=4 +/// CHECK: Callback DataOp EMI: endpoint=2 optype=4 +/// CHECK: Callback Target EMI: kind=3 endpoint=2 +/// CHECK-NOT: device_num=-1 +/// CHECK: Callback Target EMI: kind=1 endpoint=1 +/// CHECK-NOT: device_num=-1 +/// CHECK: Callback Submit EMI: endpoint=1 req_num_teams=1 +/// CHECK: Callback Submit EMI: endpoint=2 req_num_teams=1 +/// CHECK: Callback Target EMI: kind=1 endpoint=2 +/// CHECK-NOT: device_num=-1 +/// CHECK: Callback Target EMI: kind=4 endpoint=1 +/// CHECK-NOT: device_num=-1 +/// CHECK: Callback DataOp EMI: endpoint=1 optype=3 +/// CHECK: Callback DataOp EMI: endpoint=2 optype=3 +/// CHECK: Callback Target EMI: kind=4 endpoint=2 +/// CHECK-NOT: device_num=-1 +/// CHECK: Callback Fini: diff --git a/openmp/libomptarget/test/ompt/veccopy_disallow_both.c b/openmp/libomptarget/test/ompt/veccopy_disallow_both.c index 6fdcfdb035375..9d3498dc72d23 100644 --- a/openmp/libomptarget/test/ompt/veccopy_disallow_both.c +++ b/openmp/libomptarget/test/ompt/veccopy_disallow_both.c @@ -63,10 +63,12 @@ int main() { /// CHECK: Callback Target EMI: kind=1 endpoint=1 /// CHECK: Callback DataOp EMI: endpoint=1 optype=1 /// CHECK: Callback DataOp EMI: endpoint=2 optype=1 +/// CHECK-NOT: dest=(nil) /// CHECK: Callback DataOp EMI: endpoint=1 optype=2 /// CHECK: Callback DataOp EMI: endpoint=2 optype=2 /// CHECK: Callback DataOp EMI: endpoint=1 optype=1 /// CHECK: Callback DataOp EMI: endpoint=2 optype=1 +/// CHECK-NOT: dest=(nil) /// CHECK: Callback DataOp EMI: endpoint=1 optype=2 /// CHECK: 
Callback DataOp EMI: endpoint=2 optype=2 /// CHECK: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=1 @@ -82,10 +84,12 @@ int main() { /// CHECK: Callback Target EMI: kind=1 endpoint=1 /// CHECK: Callback DataOp EMI: endpoint=1 optype=1 /// CHECK: Callback DataOp EMI: endpoint=2 optype=1 +/// CHECK-NOT: dest=(nil) /// CHECK: Callback DataOp EMI: endpoint=1 optype=2 /// CHECK: Callback DataOp EMI: endpoint=2 optype=2 /// CHECK: Callback DataOp EMI: endpoint=1 optype=1 /// CHECK: Callback DataOp EMI: endpoint=2 optype=1 +/// CHECK-NOT: dest=(nil) /// CHECK: Callback DataOp EMI: endpoint=1 optype=2 /// CHECK: Callback DataOp EMI: endpoint=2 optype=2 /// CHECK: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=0 diff --git a/openmp/libomptarget/test/ompt/veccopy_emi.c b/openmp/libomptarget/test/ompt/veccopy_emi.c index f15dfb18da46f..5adf302bd1fff 100644 --- a/openmp/libomptarget/test/ompt/veccopy_emi.c +++ b/openmp/libomptarget/test/ompt/veccopy_emi.c @@ -61,10 +61,12 @@ int main() { /// CHECK: Callback Target EMI: kind=1 endpoint=1 /// CHECK: Callback DataOp EMI: endpoint=1 optype=1 /// CHECK: Callback DataOp EMI: endpoint=2 optype=1 +/// CHECK-NOT: dest=(nil) /// CHECK: Callback DataOp EMI: endpoint=1 optype=2 /// CHECK: Callback DataOp EMI: endpoint=2 optype=2 /// CHECK: Callback DataOp EMI: endpoint=1 optype=1 /// CHECK: Callback DataOp EMI: endpoint=2 optype=1 +/// CHECK-NOT: dest=(nil) /// CHECK: Callback DataOp EMI: endpoint=1 optype=2 /// CHECK: Callback DataOp EMI: endpoint=2 optype=2 /// CHECK: Callback Submit EMI: endpoint=1 req_num_teams=1 @@ -81,10 +83,12 @@ int main() { /// CHECK: Callback Target EMI: kind=1 endpoint=1 /// CHECK: Callback DataOp EMI: endpoint=1 optype=1 /// CHECK: Callback DataOp EMI: endpoint=2 optype=1 +/// CHECK-NOT: dest=(nil) /// CHECK: Callback DataOp EMI: endpoint=1 optype=2 /// CHECK: Callback DataOp EMI: endpoint=2 optype=2 /// CHECK: 
Callback DataOp EMI: endpoint=1 optype=1 /// CHECK: Callback DataOp EMI: endpoint=2 optype=1 +/// CHECK-NOT: dest=(nil) /// CHECK: Callback DataOp EMI: endpoint=1 optype=2 /// CHECK: Callback DataOp EMI: endpoint=2 optype=2 /// CHECK: Callback Submit EMI: endpoint=1 req_num_teams=0 diff --git a/openmp/libomptarget/test/ompt/veccopy_emi_map.c b/openmp/libomptarget/test/ompt/veccopy_emi_map.c index af0743f0369c5..edf08325c41ba 100644 --- a/openmp/libomptarget/test/ompt/veccopy_emi_map.c +++ b/openmp/libomptarget/test/ompt/veccopy_emi_map.c @@ -62,10 +62,12 @@ int main() { /// CHECK: Callback Target EMI: kind=1 endpoint=1 /// CHECK: Callback DataOp EMI: endpoint=1 optype=1 /// CHECK: Callback DataOp EMI: endpoint=2 optype=1 +/// CHECK-NOT: dest=(nil) /// CHECK: Callback DataOp EMI: endpoint=1 optype=2 /// CHECK: Callback DataOp EMI: endpoint=2 optype=2 /// CHECK: Callback DataOp EMI: endpoint=1 optype=1 /// CHECK: Callback DataOp EMI: endpoint=2 optype=1 +/// CHECK-NOT: dest=(nil) /// CHECK: Callback DataOp EMI: endpoint=1 optype=2 /// CHECK: Callback DataOp EMI: endpoint=2 optype=2 /// CHECK: Callback Submit EMI: endpoint=1 req_num_teams=1 @@ -82,10 +84,12 @@ int main() { /// CHECK: Callback Target EMI: kind=1 endpoint=1 /// CHECK: Callback DataOp EMI: endpoint=1 optype=1 /// CHECK: Callback DataOp EMI: endpoint=2 optype=1 +/// CHECK-NOT: dest=(nil) /// CHECK: Callback DataOp EMI: endpoint=1 optype=2 /// CHECK: Callback DataOp EMI: endpoint=2 optype=2 /// CHECK: Callback DataOp EMI: endpoint=1 optype=1 /// CHECK: Callback DataOp EMI: endpoint=2 optype=1 +/// CHECK-NOT: dest=(nil) /// CHECK: Callback DataOp EMI: endpoint=1 optype=2 /// CHECK: Callback DataOp EMI: endpoint=2 optype=2 /// CHECK: Callback Submit EMI: endpoint=1 req_num_teams=0