From c6d6417a0b7cd3a855ae8dcc6dd1111e1759dd2a Mon Sep 17 00:00:00 2001
From: Jessica Clarke <jrtc27@jrtc27.com>
Date: Tue, 22 Aug 2023 23:38:24 +0100
Subject: [PATCH 01/92] [compiler-rt] Use .globl for FreeBSD/NetBSD interceptor
 wrappers

On FreeBSD and NetBSD we don't use .weak due to differing semantics.
Currently we end up using no directive, which gives a local symbol,
whereas the closer thing to a weak symbol would be a global one. In
particular, both GNU and LLVM toolchains cannot handle a GOT-indirect
reference to a local symbol at a non-zero offset within a section on
AArch64 (see https://github.com/ARM-software/abi-aa/issues/217), and so
interceptors do not work on FreeBSD/arm64, failing to link with LLD.
Switching to .globl both works around this bug and more closely aligns
such non-weak platforms with weak ones.

Fixes https://github.com/llvm/llvm-project/issues/63418

Reviewed By: MaskRay

Differential Revision: https://reviews.llvm.org/D158552

(cherry picked from commit 7e1afab1b1821550c5f8d0d6a50636236fa02e2c)
---
 compiler-rt/lib/interception/interception.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler-rt/lib/interception/interception.h b/compiler-rt/lib/interception/interception.h
index 078d33b61be31..069f73d276f3c 100644
--- a/compiler-rt/lib/interception/interception.h
+++ b/compiler-rt/lib/interception/interception.h
@@ -181,7 +181,7 @@ const interpose_substitution substitution_##func_name[]             \
 // FreeBSD's dynamic linker (incompliantly) gives non-weak symbols higher
 // priority than weak ones so weak aliases won't work for indirect calls
 // in position-independent (-fPIC / -fPIE) mode.
-#   define __ASM_WEAK_WRAPPER(func)
+#   define __ASM_WEAK_WRAPPER(func) ".globl " #func "\n"
 #  else
 #   define __ASM_WEAK_WRAPPER(func) ".weak " #func "\n"
 #  endif  // SANITIZER_FREEBSD || SANITIZER_NETBSD

From 2e17e9e261dd511398a399226ce9e69f3dbc5d3f Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Mon, 21 Aug 2023 14:11:14 -0700
Subject: [PATCH 02/92] [RISCV] Check type size for lax conversions between RVV
 builtin types and VectorType::RVVFixedLengthDataVector.

This code was copied from SVE and modified for RVV. For SVE, there
is only one size for builtin types so they didn't need to check
the size. For RVV, due to LMUL there are 7 different sizes of builtin
types so we do need to check the size.

I'm not sure we should have lax vector conversions at all for RVV.
That appears to be contributing to https://github.com/llvm/llvm-project/issues/64404

This patch at least fixes the obvious correctness issue.
This should be backported to LLVM 17.

Reviewed By: jacquesguan

Differential Revision: https://reviews.llvm.org/D157130

(cherry picked from commit 33af2f131db71a18aefc5469129540e2097a537f)
---
 clang/lib/AST/ASTContext.cpp                   |  5 ++---
 .../Sema/riscv-rvv-lax-vector-conversions.c    | 18 ++++++++++++++++--
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 7acacd7bf4f50..76000156fece7 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -9612,9 +9612,8 @@ bool ASTContext::areLaxCompatibleRVVTypes(QualType FirstType,
       const LangOptions::LaxVectorConversionKind LVCKind =
           getLangOpts().getLaxVectorConversions();
 
-      // If __riscv_v_fixed_vlen != N do not allow GNU vector lax conversion.
-      if (VecTy->getVectorKind() == VectorType::GenericVector &&
-          getTypeSize(SecondType) != getRVVTypeSize(*this, BT))
+      // If __riscv_v_fixed_vlen != N do not allow vector lax conversion.
+      if (getTypeSize(SecondType) != getRVVTypeSize(*this, BT))
         return false;
 
       // If -flax-vector-conversions=all is specified, the types are
diff --git a/clang/test/Sema/riscv-rvv-lax-vector-conversions.c b/clang/test/Sema/riscv-rvv-lax-vector-conversions.c
index ff3e028aa314d..8ab01620b82aa 100644
--- a/clang/test/Sema/riscv-rvv-lax-vector-conversions.c
+++ b/clang/test/Sema/riscv-rvv-lax-vector-conversions.c
@@ -2,8 +2,6 @@
 // RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=8 -mvscale-max=8 -flax-vector-conversions=integer -ffreestanding -fsyntax-only -verify=lax-vector-integer %s
 // RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=8 -mvscale-max=8 -flax-vector-conversions=all -ffreestanding -fsyntax-only -verify=lax-vector-all %s
 
-// lax-vector-all-no-diagnostics
-
 // REQUIRES: riscv-registered-target
 
 #define RVV_FIXED_ATTR __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen)))
@@ -20,6 +18,8 @@ typedef __rvv_uint64m1_t vuint64m1_t;
 typedef __rvv_float32m1_t vfloat32m1_t;
 typedef __rvv_float64m1_t vfloat64m1_t;
 
+typedef __rvv_int64m2_t vint64m2_t;
+
 typedef vfloat32m1_t rvv_fixed_float32m1_t RVV_FIXED_ATTR;
 typedef vint32m1_t rvv_fixed_int32m1_t RVV_FIXED_ATTR;
 typedef float gnu_fixed_float32m1_t GNU_FIXED_ATTR;
@@ -76,3 +76,17 @@ void gnu_allowed_with_all_lax_conversions() {
   // lax-vector-none-error@-1 {{assigning to 'vfloat64m1_t' (aka '__rvv_float64m1_t') from incompatible type}}
   // lax-vector-integer-error@-2 {{assigning to 'vfloat64m1_t' (aka '__rvv_float64m1_t') from incompatible type}}
 }
+
+void not_allowed() {
+  rvv_fixed_int32m1_t fi32m1;
+  vint64m2_t si64m2;
+
+  fi32m1 = si64m2;
+  // lax-vector-none-error@-1 {{assigning to 'rvv_fixed_int32m1_t' (vector of 16 'int' values) from incompatible type}}
+  // lax-vector-integer-error@-2 {{assigning to 'rvv_fixed_int32m1_t' (vector of 16 'int' values) from incompatible type}}
+  // lax-vector-all-error@-3 {{assigning to 'rvv_fixed_int32m1_t' (vector of 16 'int' values) from incompatible type}}
+  si64m2 = fi32m1;
+  // lax-vector-none-error@-1 {{assigning to 'vint64m2_t' (aka '__rvv_int64m2_t') from incompatible type}}
+  // lax-vector-integer-error@-2 {{assigning to 'vint64m2_t' (aka '__rvv_int64m2_t') from incompatible type}}
+  // lax-vector-all-error@-3 {{assigning to 'vint64m2_t' (aka '__rvv_int64m2_t') from incompatible type}}
+}

From 4d5feafb9dc0d7e9b12b116f07307085687c2e3d Mon Sep 17 00:00:00 2001
From: Michael Halkenhaeuser <MichaelGerald.Halkenhauser@amd.com>
Date: Wed, 16 Aug 2023 06:38:39 -0400
Subject: [PATCH 03/92] [OpenMP][OMPT] Fix reported target pointer for data
 alloc callback

This patch fixes: https://github.com/llvm/llvm-project/issues/64671
DataOp EMI callbacks would not report the correct target pointer.
This is now alleviated by passing a `void**` into the function which
emits the actual callback, then evaluating that pointer.

Note: Since this is only done after the pointer has been properly
updated, only `endpoint=2` callbacks will show a non-null value.

Reviewed By: dhruvachak, jdoerfert

Differential Revision: https://reviews.llvm.org/D157996

(cherry picked from commit 41f3626f8b300cef24c06d9e8b7cf53029a4330a)
---
 openmp/libomptarget/src/OmptCallback.cpp             | 12 +++++++-----
 openmp/libomptarget/src/OmptInterface.h              |  8 ++++----
 openmp/libomptarget/src/device.cpp                   |  6 ++++--
 .../libomptarget/test/ompt/veccopy_disallow_both.c   |  4 ++++
 openmp/libomptarget/test/ompt/veccopy_emi.c          |  4 ++++
 openmp/libomptarget/test/ompt/veccopy_emi_map.c      |  4 ++++
 6 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/openmp/libomptarget/src/OmptCallback.cpp b/openmp/libomptarget/src/OmptCallback.cpp
index cd44d0903be9c..4882a762adbf6 100644
--- a/openmp/libomptarget/src/OmptCallback.cpp
+++ b/openmp/libomptarget/src/OmptCallback.cpp
@@ -71,7 +71,8 @@ static uint64_t createRegionId() {
 }
 
 void Interface::beginTargetDataAlloc(int64_t DeviceId, void *HstPtrBegin,
-                                     size_t Size, void *Code) {
+                                     void **TgtPtrBegin, size_t Size,
+                                     void *Code) {
   beginTargetDataOperation();
   if (ompt_callback_target_data_op_emi_fn) {
     // HostOpId will be set by the tool. Invoke the tool supplied data op EMI
@@ -79,7 +80,7 @@ void Interface::beginTargetDataAlloc(int64_t DeviceId, void *HstPtrBegin,
     ompt_callback_target_data_op_emi_fn(
         ompt_scope_begin, TargetTaskData, &TargetData, &TargetRegionOpId,
         ompt_target_data_alloc, HstPtrBegin,
-        /* SrcDeviceNum */ omp_get_initial_device(), /* TgtPtrBegin */ nullptr,
+        /* SrcDeviceNum */ omp_get_initial_device(), *TgtPtrBegin,
         /* TgtDeviceNum */ DeviceId, Size, Code);
   } else if (ompt_callback_target_data_op_fn) {
     // HostOpId is set by the runtime
@@ -87,13 +88,14 @@ void Interface::beginTargetDataAlloc(int64_t DeviceId, void *HstPtrBegin,
     // Invoke the tool supplied data op callback
     ompt_callback_target_data_op_fn(
         TargetData.value, HostOpId, ompt_target_data_alloc, HstPtrBegin,
-        /* SrcDeviceNum */ omp_get_initial_device(), /* TgtPtrBegin */ nullptr,
+        /* SrcDeviceNum */ omp_get_initial_device(), *TgtPtrBegin,
         /* TgtDeviceNum */ DeviceId, Size, Code);
   }
 }
 
 void Interface::endTargetDataAlloc(int64_t DeviceId, void *HstPtrBegin,
-                                   size_t Size, void *Code) {
+                                   void **TgtPtrBegin, size_t Size,
+                                   void *Code) {
   // Only EMI callback handles end scope
   if (ompt_callback_target_data_op_emi_fn) {
     // HostOpId will be set by the tool. Invoke the tool supplied data op EMI
@@ -101,7 +103,7 @@ void Interface::endTargetDataAlloc(int64_t DeviceId, void *HstPtrBegin,
     ompt_callback_target_data_op_emi_fn(
         ompt_scope_end, TargetTaskData, &TargetData, &TargetRegionOpId,
         ompt_target_data_alloc, HstPtrBegin,
-        /* SrcDeviceNum */ omp_get_initial_device(), /* TgtPtrBegin */ nullptr,
+        /* SrcDeviceNum */ omp_get_initial_device(), *TgtPtrBegin,
         /* TgtDeviceNum */ DeviceId, Size, Code);
   }
   endTargetDataOperation();
diff --git a/openmp/libomptarget/src/OmptInterface.h b/openmp/libomptarget/src/OmptInterface.h
index c3a52969bf80e..178cedacf4a58 100644
--- a/openmp/libomptarget/src/OmptInterface.h
+++ b/openmp/libomptarget/src/OmptInterface.h
@@ -47,12 +47,12 @@ static ompt_get_target_task_data_t ompt_get_target_task_data_fn;
 class Interface {
 public:
   /// Top-level function for invoking callback before device data allocation
-  void beginTargetDataAlloc(int64_t DeviceId, void *TgtPtrBegin, size_t Size,
-                            void *Code);
+  void beginTargetDataAlloc(int64_t DeviceId, void *HstPtrBegin,
+                            void **TgtPtrBegin, size_t Size, void *Code);
 
   /// Top-level function for invoking callback after device data allocation
-  void endTargetDataAlloc(int64_t DeviceId, void *TgtPtrBegin, size_t Size,
-                          void *Code);
+  void endTargetDataAlloc(int64_t DeviceId, void *HstPtrBegin,
+                          void **TgtPtrBegin, size_t Size, void *Code);
 
   /// Top-level function for invoking callback before data submit
   void beginTargetDataSubmit(int64_t DeviceId, void *HstPtrBegin,
diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp
index 276b7c9f499c5..1421408435c2c 100644
--- a/openmp/libomptarget/src/device.cpp
+++ b/openmp/libomptarget/src/device.cpp
@@ -561,12 +561,14 @@ __tgt_target_table *DeviceTy::loadBinary(void *Img) {
 
 void *DeviceTy::allocData(int64_t Size, void *HstPtr, int32_t Kind) {
   /// RAII to establish tool anchors before and after data allocation
+  void *TargetPtr = nullptr;
   OMPT_IF_BUILT(InterfaceRAII TargetDataAllocRAII(
                     RegionInterface.getCallbacks<ompt_target_data_alloc>(),
-                    RTLDeviceID, HstPtr, Size,
+                    RTLDeviceID, HstPtr, &TargetPtr, Size,
                     /* CodePtr */ OMPT_GET_RETURN_ADDRESS(0));)
 
-  return RTL->data_alloc(RTLDeviceID, Size, HstPtr, Kind);
+  TargetPtr = RTL->data_alloc(RTLDeviceID, Size, HstPtr, Kind);
+  return TargetPtr;
 }
 
 int32_t DeviceTy::deleteData(void *TgtAllocBegin, int32_t Kind) {
diff --git a/openmp/libomptarget/test/ompt/veccopy_disallow_both.c b/openmp/libomptarget/test/ompt/veccopy_disallow_both.c
index 6fdcfdb035375..9d3498dc72d23 100644
--- a/openmp/libomptarget/test/ompt/veccopy_disallow_both.c
+++ b/openmp/libomptarget/test/ompt/veccopy_disallow_both.c
@@ -63,10 +63,12 @@ int main() {
 /// CHECK: Callback Target EMI: kind=1 endpoint=1
 /// CHECK: Callback DataOp EMI: endpoint=1 optype=1
 /// CHECK: Callback DataOp EMI: endpoint=2 optype=1
+/// CHECK-NOT: dest=(nil)
 /// CHECK: Callback DataOp EMI: endpoint=1 optype=2
 /// CHECK: Callback DataOp EMI: endpoint=2 optype=2
 /// CHECK: Callback DataOp EMI: endpoint=1 optype=1
 /// CHECK: Callback DataOp EMI: endpoint=2 optype=1
+/// CHECK-NOT: dest=(nil)
 /// CHECK: Callback DataOp EMI: endpoint=1 optype=2
 /// CHECK: Callback DataOp EMI: endpoint=2 optype=2
 /// CHECK: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=1
@@ -82,10 +84,12 @@ int main() {
 /// CHECK: Callback Target EMI: kind=1 endpoint=1
 /// CHECK: Callback DataOp EMI: endpoint=1 optype=1
 /// CHECK: Callback DataOp EMI: endpoint=2 optype=1
+/// CHECK-NOT: dest=(nil)
 /// CHECK: Callback DataOp EMI: endpoint=1 optype=2
 /// CHECK: Callback DataOp EMI: endpoint=2 optype=2
 /// CHECK: Callback DataOp EMI: endpoint=1 optype=1
 /// CHECK: Callback DataOp EMI: endpoint=2 optype=1
+/// CHECK-NOT: dest=(nil)
 /// CHECK: Callback DataOp EMI: endpoint=1 optype=2
 /// CHECK: Callback DataOp EMI: endpoint=2 optype=2
 /// CHECK: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=0
diff --git a/openmp/libomptarget/test/ompt/veccopy_emi.c b/openmp/libomptarget/test/ompt/veccopy_emi.c
index f15dfb18da46f..5adf302bd1fff 100644
--- a/openmp/libomptarget/test/ompt/veccopy_emi.c
+++ b/openmp/libomptarget/test/ompt/veccopy_emi.c
@@ -61,10 +61,12 @@ int main() {
 /// CHECK: Callback Target EMI: kind=1 endpoint=1
 /// CHECK: Callback DataOp EMI: endpoint=1 optype=1
 /// CHECK: Callback DataOp EMI: endpoint=2 optype=1
+/// CHECK-NOT: dest=(nil)
 /// CHECK: Callback DataOp EMI: endpoint=1 optype=2
 /// CHECK: Callback DataOp EMI: endpoint=2 optype=2
 /// CHECK: Callback DataOp EMI: endpoint=1 optype=1
 /// CHECK: Callback DataOp EMI: endpoint=2 optype=1
+/// CHECK-NOT: dest=(nil)
 /// CHECK: Callback DataOp EMI: endpoint=1 optype=2
 /// CHECK: Callback DataOp EMI: endpoint=2 optype=2
 /// CHECK: Callback Submit EMI: endpoint=1 req_num_teams=1
@@ -81,10 +83,12 @@ int main() {
 /// CHECK: Callback Target EMI: kind=1 endpoint=1
 /// CHECK: Callback DataOp EMI: endpoint=1 optype=1
 /// CHECK: Callback DataOp EMI: endpoint=2 optype=1
+/// CHECK-NOT: dest=(nil)
 /// CHECK: Callback DataOp EMI: endpoint=1 optype=2
 /// CHECK: Callback DataOp EMI: endpoint=2 optype=2
 /// CHECK: Callback DataOp EMI: endpoint=1 optype=1
 /// CHECK: Callback DataOp EMI: endpoint=2 optype=1
+/// CHECK-NOT: dest=(nil)
 /// CHECK: Callback DataOp EMI: endpoint=1 optype=2
 /// CHECK: Callback DataOp EMI: endpoint=2 optype=2
 /// CHECK: Callback Submit EMI: endpoint=1 req_num_teams=0
diff --git a/openmp/libomptarget/test/ompt/veccopy_emi_map.c b/openmp/libomptarget/test/ompt/veccopy_emi_map.c
index af0743f0369c5..edf08325c41ba 100644
--- a/openmp/libomptarget/test/ompt/veccopy_emi_map.c
+++ b/openmp/libomptarget/test/ompt/veccopy_emi_map.c
@@ -62,10 +62,12 @@ int main() {
 /// CHECK: Callback Target EMI: kind=1 endpoint=1
 /// CHECK: Callback DataOp EMI: endpoint=1 optype=1
 /// CHECK: Callback DataOp EMI: endpoint=2 optype=1
+/// CHECK-NOT: dest=(nil)
 /// CHECK: Callback DataOp EMI: endpoint=1 optype=2
 /// CHECK: Callback DataOp EMI: endpoint=2 optype=2
 /// CHECK: Callback DataOp EMI: endpoint=1 optype=1
 /// CHECK: Callback DataOp EMI: endpoint=2 optype=1
+/// CHECK-NOT: dest=(nil)
 /// CHECK: Callback DataOp EMI: endpoint=1 optype=2
 /// CHECK: Callback DataOp EMI: endpoint=2 optype=2
 /// CHECK: Callback Submit EMI: endpoint=1 req_num_teams=1
@@ -82,10 +84,12 @@ int main() {
 /// CHECK: Callback Target EMI: kind=1 endpoint=1
 /// CHECK: Callback DataOp EMI: endpoint=1 optype=1
 /// CHECK: Callback DataOp EMI: endpoint=2 optype=1
+/// CHECK-NOT: dest=(nil)
 /// CHECK: Callback DataOp EMI: endpoint=1 optype=2
 /// CHECK: Callback DataOp EMI: endpoint=2 optype=2
 /// CHECK: Callback DataOp EMI: endpoint=1 optype=1
 /// CHECK: Callback DataOp EMI: endpoint=2 optype=1
+/// CHECK-NOT: dest=(nil)
 /// CHECK: Callback DataOp EMI: endpoint=1 optype=2
 /// CHECK: Callback DataOp EMI: endpoint=2 optype=2
 /// CHECK: Callback Submit EMI: endpoint=1 req_num_teams=0

From 1d54dc2f75861295aeb99f480aed244dc5cedea5 Mon Sep 17 00:00:00 2001
From: Michael Halkenhaeuser <MichaelGerald.Halkenhauser@amd.com>
Date: Tue, 22 Aug 2023 12:40:55 -0400
Subject: [PATCH 04/92] [OpenMP][OMPT] Fix `target enter data` callback
 ordering & reported device num

This patch fixes: https://github.com/llvm/llvm-project/issues/64738
We observed multiple issues, primarily that the `DeviceId` was reported as -1
in certain scenarios. The reason for this is simply that the device is not
initialized at that point. Hence, we need to move the RAII object creation just
after the `checkDeviceAndCtors`, closer to the actual call we want to observe.

This also solves an ordering issue where one `target enter data` callback would
be executed before the `Init` callback.
Additionally, this change fixes an issue where `enter / exit data` and `update`
constructs used in conjunction with `nowait` would not result in the emission
of an OMPT callback.

Added a testcase to cover initialized device number and `omp target` constructs.

Reviewed By: dhruvachak

Differential Revision: https://reviews.llvm.org/D157605

(cherry picked from commit 57f0bdc8fb1e66d4ed9cfb57f1ef699eefd99646)
---
 openmp/libomptarget/src/interface.cpp        |  39 +++---
 openmp/libomptarget/test/ompt/veccopy_data.c | 128 +++++++++++++++++++
 2 files changed, 147 insertions(+), 20 deletions(-)
 create mode 100644 openmp/libomptarget/test/ompt/veccopy_data.c

diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index 7fb72e16088ce..d47f0a3458587 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -108,6 +108,21 @@ targetDataMapper(ident_t *Loc, int64_t DeviceId, int32_t ArgNum,
   TargetAsyncInfoTy TargetAsyncInfo(Device);
   AsyncInfoTy &AsyncInfo = TargetAsyncInfo;
 
+  /// RAII to establish tool anchors before and after data begin / end / update
+  OMPT_IF_BUILT(assert((TargetDataFunction == targetDataBegin ||
+                        TargetDataFunction == targetDataEnd ||
+                        TargetDataFunction == targetDataUpdate) &&
+                       "Encountered unexpected TargetDataFunction during "
+                       "execution of targetDataMapper");
+                auto CallbackFunctions =
+                    (TargetDataFunction == targetDataBegin)
+                        ? RegionInterface.getCallbacks<ompt_target_enter_data>()
+                    : (TargetDataFunction == targetDataEnd)
+                        ? RegionInterface.getCallbacks<ompt_target_exit_data>()
+                        : RegionInterface.getCallbacks<ompt_target_update>();
+                InterfaceRAII TargetDataRAII(CallbackFunctions, DeviceId,
+                                             OMPT_GET_RETURN_ADDRESS(0));)
+
   int Rc = OFFLOAD_SUCCESS;
   Rc = TargetDataFunction(Loc, Device, ArgNum, ArgsBase, Args, ArgSizes,
                           ArgTypes, ArgNames, ArgMappers, AsyncInfo,
@@ -129,12 +144,6 @@ EXTERN void __tgt_target_data_begin_mapper(ident_t *Loc, int64_t DeviceId,
                                            map_var_info_t *ArgNames,
                                            void **ArgMappers) {
   TIMESCOPE_WITH_IDENT(Loc);
-  /// RAII to establish tool anchors before and after data begin
-  OMPT_IF_BUILT(InterfaceRAII TargetDataEnterRAII(
-                    RegionInterface.getCallbacks<ompt_target_enter_data>(),
-                    DeviceId,
-                    /* CodePtr */ OMPT_GET_RETURN_ADDRESS(0));)
-
   targetDataMapper<AsyncInfoTy>(Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes,
                                 ArgTypes, ArgNames, ArgMappers, targetDataBegin,
                                 "Entering OpenMP data region", "begin");
@@ -161,12 +170,6 @@ EXTERN void __tgt_target_data_end_mapper(ident_t *Loc, int64_t DeviceId,
                                          map_var_info_t *ArgNames,
                                          void **ArgMappers) {
   TIMESCOPE_WITH_IDENT(Loc);
-  /// RAII to establish tool anchors before and after data end
-  OMPT_IF_BUILT(InterfaceRAII TargetDataExitRAII(
-                    RegionInterface.getCallbacks<ompt_target_exit_data>(),
-                    DeviceId,
-                    /* CodePtr */ OMPT_GET_RETURN_ADDRESS(0));)
-
   targetDataMapper<AsyncInfoTy>(Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes,
                                 ArgTypes, ArgNames, ArgMappers, targetDataEnd,
                                 "Exiting OpenMP data region", "end");
@@ -190,12 +193,6 @@ EXTERN void __tgt_target_data_update_mapper(ident_t *Loc, int64_t DeviceId,
                                             map_var_info_t *ArgNames,
                                             void **ArgMappers) {
   TIMESCOPE_WITH_IDENT(Loc);
-  /// RAII to establish tool anchors before and after data update
-  OMPT_IF_BUILT(InterfaceRAII TargetDataUpdateRAII(
-                    RegionInterface.getCallbacks<ompt_target_update>(),
-                    DeviceId,
-                    /* CodePtr */ OMPT_GET_RETURN_ADDRESS(0));)
-
   targetDataMapper<AsyncInfoTy>(
       Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames,
       ArgMappers, targetDataUpdate, "Updating OpenMP data", "update");
@@ -295,7 +292,8 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
   DeviceTy &Device = *PM->Devices[DeviceId];
   TargetAsyncInfoTy TargetAsyncInfo(Device);
   AsyncInfoTy &AsyncInfo = TargetAsyncInfo;
-  OMPT_IF_BUILT(InterfaceRAII TargetDataAllocRAII(
+  /// RAII to establish tool anchors before and after target region
+  OMPT_IF_BUILT(InterfaceRAII TargetRAII(
                     RegionInterface.getCallbacks<ompt_target>(), DeviceId,
                     /* CodePtr */ OMPT_GET_RETURN_ADDRESS(0));)
 
@@ -363,7 +361,8 @@ EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId,
     return OMP_TGT_FAIL;
   }
   DeviceTy &Device = *PM->Devices[DeviceId];
-  OMPT_IF_BUILT(InterfaceRAII TargetDataAllocRAII(
+  /// RAII to establish tool anchors before and after target region
+  OMPT_IF_BUILT(InterfaceRAII TargetRAII(
                     RegionInterface.getCallbacks<ompt_target>(), DeviceId,
                     /* CodePtr */ OMPT_GET_RETURN_ADDRESS(0));)
 
diff --git a/openmp/libomptarget/test/ompt/veccopy_data.c b/openmp/libomptarget/test/ompt/veccopy_data.c
new file mode 100644
index 0000000000000..5bbc47dc11a7d
--- /dev/null
+++ b/openmp/libomptarget/test/ompt/veccopy_data.c
@@ -0,0 +1,128 @@
+// RUN: %libomptarget-compile-run-and-check-generic
+// REQUIRES: ompt
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-oldDriver
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+/*
+ * Example OpenMP program that registers EMI callbacks.
+ * Explicitly testing for an initialized device num and
+ * #pragma omp target [data enter / data exit / update]
+ * The latter with the addition of a nowait clause.
+ */
+
+#include <omp.h>
+#include <stdio.h>
+
+#include "callbacks.h"
+#include "register_emi.h"
+
+#define N 100000
+
+#pragma omp declare target
+int c[N];
+#pragma omp end declare target
+
+int main() {
+  int a[N];
+  int b[N];
+
+  int i;
+
+  for (i = 0; i < N; i++)
+    a[i] = 0;
+
+  for (i = 0; i < N; i++)
+    b[i] = i;
+
+  for (i = 0; i < N; i++)
+    c[i] = 0;
+
+#pragma omp target enter data map(to : a)
+#pragma omp target parallel for
+  {
+    for (int j = 0; j < N; j++)
+      a[j] = b[j];
+  }
+#pragma omp target exit data map(from : a)
+
+#pragma omp target parallel for map(alloc : c)
+  {
+    for (int j = 0; j < N; j++)
+      c[j] = 2 * j + 1;
+  }
+#pragma omp target update from(c) nowait
+#pragma omp barrier
+
+  int rc = 0;
+  for (i = 0; i < N; i++) {
+    if (a[i] != i) {
+      rc++;
+      printf("Wrong value: a[%d]=%d\n", i, a[i]);
+    }
+  }
+
+  for (i = 0; i < N; i++) {
+    if (c[i] != 2 * i + 1) {
+      rc++;
+      printf("Wrong value: c[%d]=%d\n", i, c[i]);
+    }
+  }
+
+  if (!rc)
+    printf("Success\n");
+
+  return rc;
+}
+
+/// CHECK-NOT: Callback Target EMI:
+/// CHECK-NOT: device_num=-1
+/// CHECK: Callback Init:
+/// CHECK: Callback Load:
+/// CHECK: Callback Target EMI: kind=2 endpoint=1
+/// CHECK-NOT: device_num=-1
+/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
+/// CHECK: Callback DataOp EMI: endpoint=2 optype=1
+/// CHECK-NOT: dest=(nil)
+/// CHECK: Callback DataOp EMI: endpoint=1 optype=2
+/// CHECK: Callback DataOp EMI: endpoint=2 optype=2
+/// CHECK: Callback Target EMI: kind=2 endpoint=2
+/// CHECK-NOT: device_num=-1
+/// CHECK: Callback Target EMI: kind=1 endpoint=1
+/// CHECK-NOT: device_num=-1
+/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
+/// CHECK: Callback DataOp EMI: endpoint=2 optype=1
+/// CHECK-NOT: dest=(nil)
+/// CHECK: Callback DataOp EMI: endpoint=1 optype=2
+/// CHECK: Callback DataOp EMI: endpoint=2 optype=2
+/// CHECK: Callback Submit EMI: endpoint=1  req_num_teams=1
+/// CHECK: Callback Submit EMI: endpoint=2  req_num_teams=1
+/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
+/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
+/// CHECK: Callback DataOp EMI: endpoint=1 optype=4
+/// CHECK: Callback DataOp EMI: endpoint=2 optype=4
+/// CHECK: Callback Target EMI: kind=1 endpoint=2
+/// CHECK-NOT: device_num=-1
+/// CHECK: Callback Target EMI: kind=3 endpoint=1
+/// CHECK-NOT: device_num=-1
+/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
+/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
+/// CHECK: Callback DataOp EMI: endpoint=1 optype=4
+/// CHECK: Callback DataOp EMI: endpoint=2 optype=4
+/// CHECK: Callback Target EMI: kind=3 endpoint=2
+/// CHECK-NOT: device_num=-1
+/// CHECK: Callback Target EMI: kind=1 endpoint=1
+/// CHECK-NOT: device_num=-1
+/// CHECK: Callback Submit EMI: endpoint=1  req_num_teams=1
+/// CHECK: Callback Submit EMI: endpoint=2  req_num_teams=1
+/// CHECK: Callback Target EMI: kind=1 endpoint=2
+/// CHECK-NOT: device_num=-1
+/// CHECK: Callback Target EMI: kind=4 endpoint=1
+/// CHECK-NOT: device_num=-1
+/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
+/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
+/// CHECK: Callback Target EMI: kind=4 endpoint=2
+/// CHECK-NOT: device_num=-1
+/// CHECK: Callback Fini:

From 6d0ff3d27ced294d89775fda9ff4de2c6844d9e0 Mon Sep 17 00:00:00 2001
From: eopXD <yueh.ting.chen@gmail.com>
Date: Tue, 15 Aug 2023 00:22:43 -0700
Subject: [PATCH 05/92] [CGCall][RISCV] Handle function calls with parameter of
 RVV tuple type

This was an oversight in D146872, where function calls with parameters of
RVV tuple type were not covered. This commit fixes that.

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D157953

(cherry picked from commit 39a41c8905618fc942100630b4ea37f0cb95e786)
---
 clang/lib/CodeGen/CGCall.cpp                  | 64 ++++++++++++-------
 .../rvv-tuple-type.c                          | 33 ++++++++++
 .../Transforms/SROA/scalable-vector-struct.ll | 31 +++++++++
 3 files changed, 106 insertions(+), 22 deletions(-)

diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index fcc1620f7a043..6b8af9bf18c1f 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -5239,30 +5239,50 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
             dyn_cast<llvm::StructType>(ArgInfo.getCoerceToType());
       if (STy && ArgInfo.isDirect() && ArgInfo.getCanBeFlattened()) {
         llvm::Type *SrcTy = Src.getElementType();
-        uint64_t SrcSize = CGM.getDataLayout().getTypeAllocSize(SrcTy);
-        uint64_t DstSize = CGM.getDataLayout().getTypeAllocSize(STy);
-
-        // If the source type is smaller than the destination type of the
-        // coerce-to logic, copy the source value into a temp alloca the size
-        // of the destination type to allow loading all of it. The bits past
-        // the source value are left undef.
-        if (SrcSize < DstSize) {
-          Address TempAlloca
-            = CreateTempAlloca(STy, Src.getAlignment(),
-                               Src.getName() + ".coerce");
-          Builder.CreateMemCpy(TempAlloca, Src, SrcSize);
-          Src = TempAlloca;
+        llvm::TypeSize SrcTypeSize =
+            CGM.getDataLayout().getTypeAllocSize(SrcTy);
+        llvm::TypeSize DstTypeSize = CGM.getDataLayout().getTypeAllocSize(STy);
+        if (SrcTypeSize.isScalable()) {
+          assert(STy->containsHomogeneousScalableVectorTypes() &&
+                 "ABI only supports structure with homogeneous scalable vector "
+                 "type");
+          assert(SrcTypeSize == DstTypeSize &&
+                 "Only allow non-fractional movement of structure with "
+                 "homogeneous scalable vector type");
+          assert(NumIRArgs == STy->getNumElements());
+
+          llvm::Value *StoredStructValue =
+              Builder.CreateLoad(Src, Src.getName() + ".tuple");
+          for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+            llvm::Value *Extract = Builder.CreateExtractValue(
+                StoredStructValue, i, Src.getName() + ".extract" + Twine(i));
+            IRCallArgs[FirstIRArg + i] = Extract;
+          }
         } else {
-          Src = Src.withElementType(STy);
-        }
+          uint64_t SrcSize = SrcTypeSize.getFixedValue();
+          uint64_t DstSize = DstTypeSize.getFixedValue();
+
+          // If the source type is smaller than the destination type of the
+          // coerce-to logic, copy the source value into a temp alloca the size
+          // of the destination type to allow loading all of it. The bits past
+          // the source value are left undef.
+          if (SrcSize < DstSize) {
+            Address TempAlloca = CreateTempAlloca(STy, Src.getAlignment(),
+                                                  Src.getName() + ".coerce");
+            Builder.CreateMemCpy(TempAlloca, Src, SrcSize);
+            Src = TempAlloca;
+          } else {
+            Src = Src.withElementType(STy);
+          }
 
-        assert(NumIRArgs == STy->getNumElements());
-        for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
-          Address EltPtr = Builder.CreateStructGEP(Src, i);
-          llvm::Value *LI = Builder.CreateLoad(EltPtr);
-          if (ArgHasMaybeUndefAttr)
-            LI = Builder.CreateFreeze(LI);
-          IRCallArgs[FirstIRArg + i] = LI;
+          assert(NumIRArgs == STy->getNumElements());
+          for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+            Address EltPtr = Builder.CreateStructGEP(Src, i);
+            llvm::Value *LI = Builder.CreateLoad(EltPtr);
+            if (ArgHasMaybeUndefAttr)
+              LI = Builder.CreateFreeze(LI);
+            IRCallArgs[FirstIRArg + i] = LI;
+          }
         }
       } else {
         // In the simple case, just pass the coerced loaded value.
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-tuple-type.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-tuple-type.c
index f4235795a8622..f8d755992eeac 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-tuple-type.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-tuple-type.c
@@ -90,3 +90,36 @@ void baz(__rvv_int32m1x2_t v_tuple) {
 __rvv_int32m1x2_t qux(__rvv_int32m1x2_t v_tuple) {
   return v_tuple;
 }
+
+// O0-LABEL: define dso_local { <vscale x 2 x i32>, <vscale x 2 x i32> } @quux
+// O0-SAME: (<vscale x 2 x i32> [[V_TUPLE_COERCE0:%.*]], <vscale x 2 x i32> [[V_TUPLE_COERCE1:%.*]]) #[[ATTR0]] {
+// O0-NEXT:  entry:
+// O0-NEXT:    [[V_TUPLE:%.*]] = alloca { <vscale x 2 x i32>, <vscale x 2 x i32> }, align 4
+// O0-NEXT:    [[V_TUPLE_ADDR:%.*]] = alloca { <vscale x 2 x i32>, <vscale x 2 x i32> }, align 4
+// O0-NEXT:    [[COERCE:%.*]] = alloca { <vscale x 2 x i32>, <vscale x 2 x i32> }, align 4
+// O0-NEXT:    [[TMP0:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> [[V_TUPLE_COERCE0]], 0
+// O0-NEXT:    [[TMP1:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP0]], <vscale x 2 x i32> [[V_TUPLE_COERCE1]], 1
+// O0-NEXT:    store { <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP1]], ptr [[V_TUPLE]], align 4
+// O0-NEXT:    [[V_TUPLE1:%.*]] = load { <vscale x 2 x i32>, <vscale x 2 x i32> }, ptr [[V_TUPLE]], align 4
+// O0-NEXT:    store { <vscale x 2 x i32>, <vscale x 2 x i32> } [[V_TUPLE1]], ptr [[V_TUPLE_ADDR]], align 4
+// O0-NEXT:    [[TMP2:%.*]] = load { <vscale x 2 x i32>, <vscale x 2 x i32> }, ptr [[V_TUPLE_ADDR]], align 4
+// O0-NEXT:    store { <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP2]], ptr [[COERCE]], align 4
+// O0-NEXT:    [[COERCE_TUPLE:%.*]] = load { <vscale x 2 x i32>, <vscale x 2 x i32> }, ptr [[COERCE]], align 4
+// O0-NEXT:    [[COERCE_EXTRACT0:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[COERCE_TUPLE]], 0
+// O0-NEXT:    [[COERCE_EXTRACT1:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[COERCE_TUPLE]], 1
+// O0-NEXT:    [[CALL:%.*]] = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @qux(<vscale x 2 x i32> [[COERCE_EXTRACT0]], <vscale x 2 x i32> [[COERCE_EXTRACT1]])
+// O0-NEXT:    ret { <vscale x 2 x i32>, <vscale x 2 x i32> } [[CALL]]
+//
+// AFTER_MEM2REG-LABEL: define dso_local { <vscale x 2 x i32>, <vscale x 2 x i32> } @quux
+// AFTER_MEM2REG-SAME: (<vscale x 2 x i32> [[V_TUPLE_COERCE0:%.*]], <vscale x 2 x i32> [[V_TUPLE_COERCE1:%.*]]) #[[ATTR0]] {
+// AFTER_MEM2REG-NEXT:  entry:
+// AFTER_MEM2REG-NEXT:    [[TMP0:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> [[V_TUPLE_COERCE0]], 0
+// AFTER_MEM2REG-NEXT:    [[TMP1:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP0]], <vscale x 2 x i32> [[V_TUPLE_COERCE1]], 1
+// AFTER_MEM2REG-NEXT:    [[COERCE_EXTRACT0:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP1]], 0
+// AFTER_MEM2REG-NEXT:    [[COERCE_EXTRACT1:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP1]], 1
+// AFTER_MEM2REG-NEXT:    [[CALL:%.*]] = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @qux(<vscale x 2 x i32> [[COERCE_EXTRACT0]], <vscale x 2 x i32> [[COERCE_EXTRACT1]])
+// AFTER_MEM2REG-NEXT:    ret { <vscale x 2 x i32>, <vscale x 2 x i32> } [[CALL]]
+//
+__rvv_int32m1x2_t quux(__rvv_int32m1x2_t v_tuple) {
+  return qux(v_tuple);
+}
diff --git a/llvm/test/Transforms/SROA/scalable-vector-struct.ll b/llvm/test/Transforms/SROA/scalable-vector-struct.ll
index 92cd44d2b5ac3..1af4fbbd9254b 100644
--- a/llvm/test/Transforms/SROA/scalable-vector-struct.ll
+++ b/llvm/test/Transforms/SROA/scalable-vector-struct.ll
@@ -20,3 +20,34 @@ define %struct.test @alloca(<vscale x 1 x i32> %x, <vscale x 1 x i32> %y) {
   %val = load %struct.test, %struct.test* %addr, align 4
   ret %struct.test %val
 }
+
+
+define { <vscale x 2 x i32>, <vscale x 2 x i32> } @return_tuple(<vscale x 2 x i32> %v_tuple.coerce0, <vscale x 2 x i32> %v_tuple.coerce1) {
+; CHECK-LABEL: @return_tuple(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> [[V_TUPLE_COERCE0:%.*]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP0]], <vscale x 2 x i32> [[V_TUPLE_COERCE1:%.*]], 1
+; CHECK-NEXT:    [[COERCE_EXTRACT0:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP1]], 0
+; CHECK-NEXT:    [[COERCE_EXTRACT1:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP1]], 1
+; CHECK-NEXT:    [[CALL:%.*]] = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @foo(<vscale x 2 x i32> [[COERCE_EXTRACT0]], <vscale x 2 x i32> [[COERCE_EXTRACT1]])
+; CHECK-NEXT:    ret { <vscale x 2 x i32>, <vscale x 2 x i32> } [[CALL]]
+;
+entry:
+  %v_tuple = alloca { <vscale x 2 x i32>, <vscale x 2 x i32> }, align 4
+  %v_tuple.addr = alloca { <vscale x 2 x i32>, <vscale x 2 x i32> }, align 4
+  %coerce = alloca { <vscale x 2 x i32>, <vscale x 2 x i32> }, align 4
+  %0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> %v_tuple.coerce0, 0
+  %1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %0, <vscale x 2 x i32> %v_tuple.coerce1, 1
+  store { <vscale x 2 x i32>, <vscale x 2 x i32> } %1, ptr %v_tuple, align 4
+  %v_tuple1 = load { <vscale x 2 x i32>, <vscale x 2 x i32> }, ptr %v_tuple, align 4
+  store { <vscale x 2 x i32>, <vscale x 2 x i32> } %v_tuple1, ptr %v_tuple.addr, align 4
+  %2 = load { <vscale x 2 x i32>, <vscale x 2 x i32> }, ptr %v_tuple.addr, align 4
+  store { <vscale x 2 x i32>, <vscale x 2 x i32> } %2, ptr %coerce, align 4
+  %coerce.tuple = load { <vscale x 2 x i32>, <vscale x 2 x i32> }, ptr %coerce, align 4
+  %coerce.extract0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %coerce.tuple, 0
+  %coerce.extract1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %coerce.tuple, 1
+  %call = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @foo(<vscale x 2 x i32> %coerce.extract0, <vscale x 2 x i32> %coerce.extract1)
+  ret { <vscale x 2 x i32>, <vscale x 2 x i32> } %call
+}
+
+declare { <vscale x 2 x i32>, <vscale x 2 x i32> } @foo(<vscale x 2 x i32>, <vscale x 2 x i32>)

From 7dbb49af2aba685d8776730fb362448a963a2aa5 Mon Sep 17 00:00:00 2001
From: Chuanqi Xu <yedeng.yd@linux.alibaba.com>
Date: Wed, 23 Aug 2023 16:37:21 +0800
Subject: [PATCH 06/92] [Coroutines] [CoroElide] Don't think exceptional
 terminator don't leak coro handle unconditionally any more

Close https://github.com/llvm/llvm-project/issues/59723.

The fundamental cause of the above issue is that we assumed the memory
of the coroutine frame can be released by stack unwinding automatically
if the allocation of the coroutine frame is elided. But we missed one
point: stack unwinding has different semantics from the explicit
coroutine_handle<>::destroy(). Since the latter is explicit, it shows
the intention of the user, so we can blame the user for destroying the
coroutine frame incorrectly if a use-after-free happens. But we
can't do so with stack unwinding.

So after this patch, we no longer assume unconditionally that an
exceptional terminator doesn't leak the coroutine handle. Instead, we
assume the exceptional terminator leaks the coroutine handle too if the
coroutine is leaked somewhere along the search path.

Concretely, for C++ the exceptional terminator is no longer treated as
special. This may cause some performance regressions, but I've tested
the motivating example (std::generator). On the other hand, coroutine
elision is a middle-end optimization and not a language feature, so we
don't think such regressions should be blamed on this change,
especially since we are correcting miscompilations.

(cherry picked from commit 7037331a2f05990cd59f35a7c0f6ce87c0f3cb5f)
---
 clang/test/CodeGenCoroutines/coro-halo.cpp   |   2 +
 clang/test/CodeGenCoroutines/pr59723.cpp     | 237 +++++++++++++++++++
 llvm/lib/Transforms/Coroutines/CoroElide.cpp |  83 +++++--
 3 files changed, 299 insertions(+), 23 deletions(-)
 create mode 100644 clang/test/CodeGenCoroutines/pr59723.cpp

diff --git a/clang/test/CodeGenCoroutines/coro-halo.cpp b/clang/test/CodeGenCoroutines/coro-halo.cpp
index 6244f130b7be2..e75bedaf81fa2 100644
--- a/clang/test/CodeGenCoroutines/coro-halo.cpp
+++ b/clang/test/CodeGenCoroutines/coro-halo.cpp
@@ -1,5 +1,7 @@
 // This tests that the coroutine heap allocation elision optimization could happen succesfully.
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 -O2 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 -O2 -emit-llvm %s \
+// RUN:   -fcxx-exceptions -fexceptions -o - | FileCheck %s
 
 #include "Inputs/coroutine.h"
 #include "Inputs/numeric.h"
diff --git a/clang/test/CodeGenCoroutines/pr59723.cpp b/clang/test/CodeGenCoroutines/pr59723.cpp
new file mode 100644
index 0000000000000..7fc9995f417ac
--- /dev/null
+++ b/clang/test/CodeGenCoroutines/pr59723.cpp
@@ -0,0 +1,237 @@
+// This is reduced test case from https://github.com/llvm/llvm-project/issues/59723.
+// This is not a minimal reproducer intentionally to check the compiler's ability.
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 -fcxx-exceptions\
+// RUN:     -fexceptions -O2 -emit-llvm %s -o - | FileCheck %s
+
+#include "Inputs/coroutine.h"
+
+// executor and operation base
+
+class bug_any_executor;
+
+struct bug_async_op_base
+{
+	void invoke();
+
+protected:
+
+	~bug_async_op_base() = default;
+};
+
+class bug_any_executor
+{
+	using op_type = bug_async_op_base;
+
+public:
+
+	virtual ~bug_any_executor() = default;
+
+	// removing noexcept enables clang to find that the pointer has escaped
+	virtual void post(op_type& op) noexcept = 0;
+
+	virtual void wait() noexcept = 0;
+};
+
+class bug_thread_executor : public bug_any_executor
+{
+
+public:
+
+	void start()
+	{
+		
+	}
+
+	~bug_thread_executor()
+	{
+	}
+
+	// although this implementation is not realy noexcept due to allocation but I have a real one that is and required to be noexcept
+	virtual void post(bug_async_op_base& op) noexcept override;
+
+	virtual void wait() noexcept override
+	{
+		
+	}
+};
+
+// task and promise
+
+struct bug_final_suspend_notification
+{
+	virtual std::coroutine_handle<> get_waiter() = 0;
+};
+
+class bug_task;
+
+class bug_task_promise
+{
+	friend bug_task;
+public:
+
+	bug_task get_return_object() noexcept;
+
+	constexpr std::suspend_always initial_suspend() noexcept { return {}; }
+
+	std::suspend_always final_suspend() noexcept 
+	{
+		return {};
+	}
+
+	void unhandled_exception() noexcept;
+
+	constexpr void return_void() const noexcept {}
+
+	void get_result() const
+	{
+		
+	}
+};
+
+template <class T, class U>
+T exchange(T &&t, U &&u) {
+    T ret = t;
+    t = u;
+    return ret;
+}
+
+class bug_task
+{
+	friend bug_task_promise;
+	using handle = std::coroutine_handle<>;
+	using promise_t = bug_task_promise;
+
+	bug_task(handle coro, promise_t* p) noexcept : this_coro{ coro }, this_promise{ p }
+	{
+	
+	}
+
+public:
+	using promise_type = bug_task_promise;
+
+    bug_task(bug_task&& other) noexcept
+		: this_coro{ exchange(other.this_coro, nullptr) }, this_promise{ exchange(other.this_promise, nullptr) } { 
+		
+	}
+
+	~bug_task()
+	{
+		if (this_coro)
+			this_coro.destroy();
+	}
+
+	constexpr bool await_ready() const noexcept
+	{
+		return false;
+	}
+
+	handle await_suspend(handle waiter) noexcept
+	{
+		return this_coro;
+	}
+
+	void await_resume() 
+	{
+		return this_promise->get_result();
+	}
+
+	handle this_coro;
+	promise_t* this_promise;
+};
+
+bug_task bug_task_promise::get_return_object() noexcept
+{
+	return { std::coroutine_handle<bug_task_promise>::from_promise(*this), this };
+}
+
+// spawn operation and spawner
+
+template<class Handler>
+class bug_spawn_op final : public bug_async_op_base, bug_final_suspend_notification
+{
+	Handler handler;
+	bug_task task_;
+
+public:
+
+	bug_spawn_op(Handler handler, bug_task&& t)
+		: handler { handler }, task_{ static_cast<bug_task&&>(t) } {}
+
+	virtual std::coroutine_handle<> get_waiter() override
+	{
+		handler();
+		return std::noop_coroutine();
+	}
+};
+
+class bug_spawner;
+
+struct bug_spawner_awaiter
+{
+	bug_spawner& s;
+	std::coroutine_handle<> waiter;
+
+	bug_spawner_awaiter(bug_spawner& s) : s{ s } {}
+
+	bool await_ready() const noexcept;
+
+	void await_suspend(std::coroutine_handle<> coro);
+
+	void await_resume() {}
+};
+
+class bug_spawner
+{
+	friend bug_spawner_awaiter;
+
+	struct final_handler_t
+	{
+		bug_spawner& s;
+
+		void operator()()
+		{
+			s.awaiter_->waiter.resume();
+		}
+	};
+
+public:
+
+	bug_spawner(bug_any_executor& ex) : ex_{ ex } {}
+
+	void spawn(bug_task&& t) {
+		using op_t = bug_spawn_op<final_handler_t>;
+		// move task into ptr
+		op_t* ptr = new op_t(final_handler_t{ *this }, static_cast<bug_task&&>(t));
+		++count_;
+		ex_.post(*ptr); // ptr escapes here thus task escapes but clang can't deduce that unless post() is not noexcept
+	}
+
+	bug_spawner_awaiter wait() noexcept { return { *this }; }
+
+private:
+	bug_any_executor& ex_; // if bug_thread_executor& is used instead enables clang to detect the escape of the promise
+	bug_spawner_awaiter* awaiter_ = nullptr;
+	unsigned count_ = 0;
+};
+
+// test case
+
+bug_task bug_spawned_task(int id, int inc)
+{
+	co_return;
+}
+
+struct A {
+    A();
+};
+
+void throwing_fn(bug_spawner& s) {
+	s.spawn(bug_spawned_task(1, 2));
+    throw A{};
+}
+
+// Check that the coroutine frame of bug_spawned_task are allocated from operator new.
+// CHECK: define{{.*}}@_Z11throwing_fnR11bug_spawner
+// CHECK-NOT: alloc
+// CHECK: %[[CALL:.+]] = {{.*}}@_Znwm(i64{{.*}} 24)
+// CHECK: store ptr @_Z16bug_spawned_taskii.resume, ptr %[[CALL]]
diff --git a/llvm/lib/Transforms/Coroutines/CoroElide.cpp b/llvm/lib/Transforms/Coroutines/CoroElide.cpp
index d78ab1c1ea284..d0606c15f3d5b 100644
--- a/llvm/lib/Transforms/Coroutines/CoroElide.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroElide.cpp
@@ -194,12 +194,49 @@ bool Lowerer::hasEscapePath(const CoroBeginInst *CB,
   for (auto *DA : It->second)
     Visited.insert(DA->getParent());
 
+  SmallPtrSet<const BasicBlock *, 32> EscapingBBs;
+  for (auto *U : CB->users()) {
+    // The use from coroutine intrinsics are not a problem.
+    if (isa<CoroFreeInst, CoroSubFnInst, CoroSaveInst>(U))
+      continue;
+
+    // Think all other usages may be an escaping candidate conservatively.
+    //
+    // Note that the major user of switch ABI coroutine (the C++) will store
+    // resume.fn, destroy.fn and the index to the coroutine frame immediately.
+    // So the parent of the coro.begin in C++ will be always escaping.
+    // Then we can't get any performance benefits for C++ by improving the
+    // precision of the method.
+    //
+    // The reason why we still judge it is we want to make LLVM Coroutine in
+    // switch ABIs to be self contained as much as possible instead of a
+    // by-product of C++20 Coroutines.
+    EscapingBBs.insert(cast<Instruction>(U)->getParent());
+  }
+
+  bool PotentiallyEscaped = false;
+
   do {
     const auto *BB = Worklist.pop_back_val();
     if (!Visited.insert(BB).second)
       continue;
-    if (TIs.count(BB))
-      return true;
+
+    // A Path insensitive marker to test whether the coro.begin escapes.
+    // It is intentional to make it path insensitive while it may not be
+    // precise since we don't want the process to be too slow.
+    PotentiallyEscaped |= EscapingBBs.count(BB);
+
+    if (TIs.count(BB)) {
+      if (!BB->getTerminator()->isExceptionalTerminator() || PotentiallyEscaped)
+        return true;
+
+      // If the function ends with the exceptional terminator, the memory used
+      // by the coroutine frame can be released by stack unwinding
+      // automatically. So we can think the coro.begin doesn't escape if it
+      // exits the function by exceptional terminator.
+
+      continue;
+    }
 
     // Conservatively say that there is potentially a path.
     if (!--Limit)
@@ -236,36 +273,36 @@ bool Lowerer::shouldElide(Function *F, DominatorTree &DT) const {
   // memory location storing that value and not the virtual register.
 
   SmallPtrSet<BasicBlock *, 8> Terminators;
-  // First gather all of the non-exceptional terminators for the function.
+  // First gather all of the terminators for the function.
   // Consider the final coro.suspend as the real terminator when the current
   // function is a coroutine.
-    for (BasicBlock &B : *F) {
-      auto *TI = B.getTerminator();
-      if (TI->getNumSuccessors() == 0 && !TI->isExceptionalTerminator() &&
-          !isa<UnreachableInst>(TI))
-        Terminators.insert(&B);
-    }
+  for (BasicBlock &B : *F) {
+    auto *TI = B.getTerminator();
+
+    if (TI->getNumSuccessors() != 0 || isa<UnreachableInst>(TI))
+      continue;
+
+    Terminators.insert(&B);
+  }
 
   // Filter out the coro.destroy that lie along exceptional paths.
   SmallPtrSet<CoroBeginInst *, 8> ReferencedCoroBegins;
   for (const auto &It : DestroyAddr) {
-    // If there is any coro.destroy dominates all of the terminators for the
-    // coro.begin, we could know the corresponding coro.begin wouldn't escape.
-    for (Instruction *DA : It.second) {
-      if (llvm::all_of(Terminators, [&](auto *TI) {
-            return DT.dominates(DA, TI->getTerminator());
-          })) {
-        ReferencedCoroBegins.insert(It.first);
-        break;
-      }
-    }
-
-    // Whether there is any paths from coro.begin to Terminators which not pass
-    // through any of the coro.destroys.
+    // If every terminators is dominated by coro.destroy, we could know the
+    // corresponding coro.begin wouldn't escape.
+    //
+    // Otherwise hasEscapePath would decide whether there is any paths from
+    // coro.begin to Terminators which not pass through any of the
+    // coro.destroys.
     //
     // hasEscapePath is relatively slow, so we avoid to run it as much as
     // possible.
-    if (!ReferencedCoroBegins.count(It.first) &&
+    if (llvm::all_of(Terminators,
+                     [&](auto *TI) {
+                       return llvm::any_of(It.second, [&](auto *DA) {
+                         return DT.dominates(DA, TI->getTerminator());
+                       });
+                     }) ||
         !hasEscapePath(It.first, Terminators))
       ReferencedCoroBegins.insert(It.first);
   }

From a4bf0d98670d3d7694ee8f065bed68de8d8f1f75 Mon Sep 17 00:00:00 2001
From: Denis Revunov <revunov.denis@huawei-partners.com>
Date: Tue, 4 Jul 2023 12:25:28 +0300
Subject: [PATCH 07/92] [BOLT][Instrumentation] Keep profile open in
 WatchProcess

When a binary is instrumented with the --instrumentation-sleep-time and
--instrumentation-wait-forks options and launched, the profile is
periodically written until all the forks die. The problem is that we
cannot wait for the whole process tree, and we have no way to tell when
it's safe to read the profile. However, if we keep the profile open
throughout the life of the process tree, we can use fuser to determine
when writing is finished.

Reviewed By: rafauler

Differential Revision: https://reviews.llvm.org/D154436

(cherry picked from commit a799298152e3ef08b4919cdaac7a614f7cca9bc6)
---
 bolt/runtime/common.h  | 10 ++++++++++
 bolt/runtime/instr.cpp | 24 +++++++++++++++---------
 2 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/bolt/runtime/common.h b/bolt/runtime/common.h
index 2ec6f45fc1595..5cb1298b8a5b7 100644
--- a/bolt/runtime/common.h
+++ b/bolt/runtime/common.h
@@ -458,6 +458,16 @@ uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) {
   return ret;
 }
 
+int __ftruncate(uint64_t fd, uint64_t length) {
+  int ret;
+  __asm__ __volatile__("movq $77, %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       : "D"(fd), "S"(length)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
 int __close(uint64_t fd) {
   uint64_t ret;
   __asm__ __volatile__("movq $3, %%rax\n"
diff --git a/bolt/runtime/instr.cpp b/bolt/runtime/instr.cpp
index ef55ded40431c..147cad023290c 100644
--- a/bolt/runtime/instr.cpp
+++ b/bolt/runtime/instr.cpp
@@ -1515,11 +1515,15 @@ extern "C" void __bolt_instr_clear_counters() {
 ///    on demand.
 ///
 extern "C" void __attribute((force_align_arg_pointer))
-__bolt_instr_data_dump() {
+__bolt_instr_data_dump(int FD) {
   // Already dumping
   if (!GlobalWriteProfileMutex->acquire())
     return;
 
+  int ret = __lseek(FD, 0, SEEK_SET);
+  assert(ret == 0, "Failed to lseek!");
+  ret = __ftruncate(FD, 0);
+  assert(ret == 0, "Failed to ftruncate!");
   BumpPtrAllocator HashAlloc;
   HashAlloc.setMaxSize(0x6400000);
   ProfileWriterContext Ctx = readDescriptions();
@@ -1527,8 +1531,6 @@ __bolt_instr_data_dump() {
 
   DEBUG(printStats(Ctx));
 
-  int FD = openProfile();
-
   BumpPtrAllocator Alloc;
   Alloc.setMaxSize(0x6400000);
   const uint8_t *FuncDesc = Ctx.FuncDescriptions;
@@ -1544,7 +1546,6 @@ __bolt_instr_data_dump() {
   Ctx.CallFlowTable->forEachElement(visitCallFlowEntry, FD, &Ctx);
 
   __fsync(FD);
-  __close(FD);
   __munmap(Ctx.MMapPtr, Ctx.MMapSize);
   __close(Ctx.FileDesc);
   HashAlloc.destroy();
@@ -1557,6 +1558,7 @@ __bolt_instr_data_dump() {
 void watchProcess() {
   timespec ts, rem;
   uint64_t Ellapsed = 0ull;
+  int FD = openProfile();
   uint64_t ppid;
   if (__bolt_instr_wait_forks) {
     // Store parent pgid
@@ -1568,7 +1570,7 @@ void watchProcess() {
     ppid = __getppid();
     if (ppid == 1) {
       // Parent already dead
-      __bolt_instr_data_dump();
+      __bolt_instr_data_dump(FD);
       goto out;
     }
   }
@@ -1581,7 +1583,7 @@ void watchProcess() {
     // so no need for us to keep dumping.
     if (__kill(ppid, 0) < 0) {
       if (__bolt_instr_no_counters_clear)
-        __bolt_instr_data_dump();
+        __bolt_instr_data_dump(FD);
       break;
     }
 
@@ -1589,13 +1591,14 @@ void watchProcess() {
       continue;
 
     Ellapsed = 0;
-    __bolt_instr_data_dump();
+    __bolt_instr_data_dump(FD);
     if (__bolt_instr_no_counters_clear == false)
       __bolt_instr_clear_counters();
   }
 
 out:;
   DEBUG(report("My parent process is dead, bye!\n"));
+  __close(FD);
   __exit(0);
 }
 
@@ -1691,8 +1694,11 @@ extern "C" __attribute((naked)) void __bolt_instr_start()
 /// This is hooking into ELF's DT_FINI
 extern "C" void __bolt_instr_fini() {
   __bolt_fini_trampoline();
-  if (__bolt_instr_sleep_time == 0)
-    __bolt_instr_data_dump();
+  if (__bolt_instr_sleep_time == 0) {
+    int FD = openProfile();
+    __bolt_instr_data_dump(FD);
+    __close(FD);
+  }
   DEBUG(report("Finished.\n"));
 }
 

From 1856972dfe1d9871697af131759ab1e70402c847 Mon Sep 17 00:00:00 2001
From: Karl-Johan Johnsson <kalle@kjjohnsson.se>
Date: Sun, 13 Aug 2023 23:55:31 +0300
Subject: [PATCH 08/92] [MachineLICM][WinEH] Don't hoist register reloads out
 of funclets

This fixes https://github.com/llvm/llvm-project/issues/60766

With MSVC style exception-handling (funclets), no registers are
live when entering the funclet so they must be reloaded from the
stack.  MachineLICM can sometimes hoist such reloads out of the
funclet, which is not correct: the register will have been clobbered
when entering the funclet.  This can happen in any loop that
contains a try-catch.

This has been tested on x86_64-pc-windows-msvc.  I'm not sure if
funclets work the same on the other Windows archs.

Reviewed By: rnk, arsenm

Differential Revision: https://reviews.llvm.org/D153337

(cherry picked from commit 917574d5d8beacbefab4d0e6469dac5b50117832)
---
 llvm/lib/CodeGen/MachineLICM.cpp              |   4 +
 .../CodeGen/X86/machine-licm-vs-wineh.mir     | 141 ++++++++++++++++++
 2 files changed, 145 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/machine-licm-vs-wineh.mir

diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index 4e80e9b58c060..523e077fd9a28 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -538,6 +538,10 @@ void MachineLICMBase::HoistRegionPostRA() {
         PhysRegDefs.set(*AI);
     }
 
+    // Funclet entry blocks will clobber all registers
+    if (const uint32_t *Mask = BB->getBeginClobberMask(TRI))
+      PhysRegClobbers.setBitsNotInMask(Mask);
+
     SpeculationState = SpeculateUnknown;
     for (MachineInstr &MI : *BB)
       ProcessMI(&MI, PhysRegDefs, PhysRegClobbers, StoredFIs, Candidates);
diff --git a/llvm/test/CodeGen/X86/machine-licm-vs-wineh.mir b/llvm/test/CodeGen/X86/machine-licm-vs-wineh.mir
new file mode 100644
index 0000000000000..4bfd749fb7723
--- /dev/null
+++ b/llvm/test/CodeGen/X86/machine-licm-vs-wineh.mir
@@ -0,0 +1,141 @@
+# RUN: llc -o - %s -mtriple=x86_64-pc-windows-msvc -run-pass=machinelicm | FileCheck %s
+#
+# This test checks that MachineLICM doesn't hoist loads out of funclets.
+# Manually modified from the IR of the following C++ function by running
+# llc -stop-after=machine-cp.
+#
+# void may_throw();
+# void use(int);
+#
+# void test(int n, int arg)
+# {
+#    for (int i = 0 ; i < n ; i++)
+#        try {
+#            may_throw();
+#        }
+#        catch (...) {
+#            // Two uses to get 'arg' allocated to a register
+#            use(arg);
+#            use(arg);
+#        }
+# }
+
+--- |
+  target triple = "x86_64-pc-windows-msvc"
+
+  define void @test(i32 %n, i32 %arg) personality ptr @__CxxFrameHandler3 {
+  entry:
+    %cmp3 = icmp sgt i32 %n, 0
+    br i1 %cmp3, label %for.body.preheader, label %for.cond.cleanup
+
+  for.body.preheader:                               ; preds = %entry
+    br label %for.body
+
+  for.cond.cleanup:                                 ; preds = %for.inc, %entry
+    ret void
+
+  for.body:                                         ; preds = %for.body.preheader, %for.inc
+    %lsr.iv = phi i32 [ %n, %for.body.preheader ], [ %lsr.iv.next, %for.inc ]
+    invoke void @may_throw()
+            to label %for.inc unwind label %catch.dispatch
+
+  catch.dispatch:                                   ; preds = %for.body
+    %0 = catchswitch within none [label %catch] unwind to caller
+
+  catch:                                            ; preds = %catch.dispatch
+    %1 = catchpad within %0 [ptr null, i32 64, ptr null]
+    call void @use(i32 %arg) [ "funclet"(token %1) ]
+    call void @use(i32 %arg) [ "funclet"(token %1) ]
+    catchret from %1 to label %for.inc
+
+  for.inc:                                          ; preds = %catch, %for.body
+    %lsr.iv.next = add i32 %lsr.iv, -1
+    %exitcond.not = icmp eq i32 %lsr.iv.next, 0
+    br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  }
+
+  declare i32 @__CxxFrameHandler3(...)
+
+  declare void @may_throw()
+
+  declare void @use(i32)
+
+...
+---
+name:            test
+alignment:       16
+tracksRegLiveness: true
+hasEHCatchret:   true
+hasEHScopes:     true
+hasEHFunclets:   true
+debugInstrRef:   true
+tracksDebugUserValues: true
+liveins:
+  - { reg: '$ecx' }
+  - { reg: '$edx' }
+frameInfo:
+  maxAlignment:    8
+  hasCalls:        true
+  hasOpaqueSPAdjustment: true
+stack:
+  - { id: 0, type: spill-slot, size: 4, alignment: 4 }
+  - { id: 1, type: spill-slot, size: 4, alignment: 4 }
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: $ecx, $edx
+
+    MOV32mr %stack.1, 1, $noreg, 0, $noreg, $edx :: (store (s32) into %stack.1)
+    TEST32rr renamable $ecx, renamable $ecx, implicit-def $eflags
+    JCC_1 %bb.2, 14, implicit killed $eflags
+
+  bb.1:
+    liveins: $ecx
+
+    JMP_1 %bb.3
+
+  bb.2.for.cond.cleanup:
+    RET 0
+
+  bb.3.for.body:
+    successors: %bb.5, %bb.4
+    liveins: $ecx
+
+    EH_LABEL <mcsymbol .Leh1>
+    MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $ecx :: (store (s32) into %stack.0)
+    ADJCALLSTACKDOWN64 32, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    CALL64pcrel32 @may_throw, csr_win64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp
+    ADJCALLSTACKUP64 32, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    EH_LABEL <mcsymbol .Leh2>
+    JMP_1 %bb.5
+
+  bb.4.catch (landing-pad, ehfunclet-entry):
+    successors: %bb.5
+
+    ADJCALLSTACKDOWN64 32, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    renamable $esi = MOV32rm %stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %stack.1)
+    $ecx = COPY renamable $esi
+    CALL64pcrel32 @use, csr_win64, implicit $rsp, implicit $ssp, implicit $ecx, implicit-def $rsp, implicit-def $ssp
+    ADJCALLSTACKUP64 32, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    ADJCALLSTACKDOWN64 32, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    $ecx = COPY killed renamable $esi
+    CALL64pcrel32 @use, csr_win64, implicit $rsp, implicit $ssp, implicit $ecx, implicit-def $rsp, implicit-def $ssp
+    ADJCALLSTACKUP64 32, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    CATCHRET %bb.5, %bb.0
+
+  bb.5.for.inc:
+    successors: %bb.2, %bb.3
+
+    renamable $ecx = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0)
+    renamable $ecx = DEC32r killed renamable $ecx, implicit-def $eflags
+    JCC_1 %bb.2, 4, implicit killed $eflags
+    JMP_1 %bb.3
+
+...
+#
+# CHECK: bb.4.catch
+# CHECK: ADJCALLSTACKDOWN64
+# CHECK-NEXT: renamable [[REG:\$[a-z0-9]+]] = MOV32rm %stack.1
+# CHECK-NEXT: $ecx = COPY renamable [[REG]]
+# CHECK-NEXT: CALL64pcrel32 @use

From dcb7bcff276664c12e1eb05e19e3e845a76b7116 Mon Sep 17 00:00:00 2001
From: Nico Weber <thakis@chromium.org>
Date: Fri, 18 Aug 2023 09:44:09 -0400
Subject: [PATCH 09/92] clang: Fix x86-no-gather-no-scatter.cpp on macOS after
 993bdb047c90e9

On macOS, files are usually below `/Users/...` and clang-cl treats
that as the `/U` flag followed by something instead of as a path.
Put `--` in front of `%s` to make it treat it as a path, like in
all other tests.

The failure without this change:

    x86-no-gather-no-scatter.cpp:4:14: error: NOGATHER: expected string not found in input
    // NOGATHER: "-target-feature" "+prefer-no-gather"
                 ^
    <stdin>:5:44: note: possible intended match here
    clang: warning: 'x86-no-gather-no-scatter.cpp' treated as the '/U' option [-Wslash-u-filename]
                                               ^

(cherry picked from commit 547ee1c81fceaabcb7064ed525f11f9e94083f56)
---
 clang/test/Driver/x86-no-gather-no-scatter.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/clang/test/Driver/x86-no-gather-no-scatter.cpp b/clang/test/Driver/x86-no-gather-no-scatter.cpp
index 7efcc55787c42..63611227bd583 100644
--- a/clang/test/Driver/x86-no-gather-no-scatter.cpp
+++ b/clang/test/Driver/x86-no-gather-no-scatter.cpp
@@ -1,8 +1,8 @@
 /// Tests -mno-gather and -mno-scatter
-// RUN: %clang -c -mno-gather -### %s 2>&1 | FileCheck --check-prefix=NOGATHER %s
-// RUN: %clang_cl -c /Qgather- -### %s 2>&1 | FileCheck --check-prefix=NOGATHER %s
+// RUN: %clang -target x86_64-unknown-linux-gnu -c -mno-gather -### %s 2>&1 | FileCheck --check-prefix=NOGATHER %s
+// RUN: %clang_cl --target=x86_64-windows -c /Qgather- -### -- %s 2>&1 | FileCheck --check-prefix=NOGATHER %s
 // NOGATHER: "-target-feature" "+prefer-no-gather"
 
-// RUN: %clang -c -mno-scatter -### %s 2>&1 | FileCheck --check-prefix=NOSCATTER %s
-// RUN: %clang_cl -c /Qscatter- -### %s 2>&1 | FileCheck --check-prefix=NOSCATTER %s
+// RUN: %clang -target x86_64-unknown-linux-gnu -c -mno-scatter -### %s 2>&1 | FileCheck --check-prefix=NOSCATTER %s
+// RUN: %clang_cl --target=x86_64-windows -c /Qscatter- -### -- %s 2>&1 | FileCheck --check-prefix=NOSCATTER %s
 // NOSCATTER: "-target-feature" "+prefer-no-scatter"

From f1d5ea362577a8a1b5fafe775cf82a449daa3b07 Mon Sep 17 00:00:00 2001
From: Serge Pavlov <sepavloff@gmail.com>
Date: Mon, 21 Aug 2023 12:20:37 +0700
Subject: [PATCH 10/92] [clang] Set FP options in Sema when instantiating
 CompoundStmt

When an expression is instantiated, TreeTransform skips ImplicitCastExpr
nodes, assuming they are recreated when the instantiated expression is
built. It breaks functions that use non-default floating-point options,
because they are kept in these ImplicitCastExprs. In this case the
recreated ImplicitCastExpr takes FP options from the current Sema state
and not from the AST node.

To fix this issue, the FP options in the Sema object are set when a compound
statement is cloned in TreeTransform.

This change fixes https://github.com/llvm/llvm-project/issues/64605
([Regression 16 -> 17] Template instantiation ignores FENV_ACCESS being
ON for both definition and instantiation).

Differential Revision: https://reviews.llvm.org/D158158

(cherry picked from commit 0baf85c331090fbe2d2b42214ed0664d55feb0b5)
---
 clang/lib/Sema/TreeTransform.h        |  4 ++++
 clang/test/SemaCXX/template-64605.cpp | 23 +++++++++++++++++++++++
 2 files changed, 27 insertions(+)
 create mode 100644 clang/test/SemaCXX/template-64605.cpp

diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 10b3587885e39..097e81ea7d45a 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -7478,6 +7478,10 @@ StmtResult
 TreeTransform<Derived>::TransformCompoundStmt(CompoundStmt *S,
                                               bool IsStmtExpr) {
   Sema::CompoundScopeRAII CompoundScope(getSema());
+  Sema::FPFeaturesStateRAII FPSave(getSema());
+  if (S->hasStoredFPFeatures())
+    getSema().resetFPOptions(
+        S->getStoredFPFeatures().applyOverrides(getSema().getLangOpts()));
 
   const Stmt *ExprResult = S->getStmtExprResult();
   bool SubStmtInvalid = false;
diff --git a/clang/test/SemaCXX/template-64605.cpp b/clang/test/SemaCXX/template-64605.cpp
new file mode 100644
index 0000000000000..b13acbf2ae566
--- /dev/null
+++ b/clang/test/SemaCXX/template-64605.cpp
@@ -0,0 +1,23 @@
+// RUN: %clang_cc1 -ast-dump -ast-dump-filter=b_64605 %s | FileCheck %s
+
+// https://github.com/llvm/llvm-project/issues/64605
+
+#pragma STDC FENV_ACCESS ON
+template <typename>
+int b_64605() {
+  int x;
+  if ((float)0xFFFFFFFF != (float)0x100000000) {
+    x = 1;
+  }
+  return x;
+}
+int f() { return b_64605<void>(); }
+
+// CHECK:      ImplicitCastExpr {{.*}} 'float' <IntegralToFloating> RoundingMath=1 AllowFEnvAccess=1
+// CHECK-NEXT: IntegerLiteral {{.*}} 4294967295
+
+// CHECK:      FunctionDecl {{.*}} b_64605 'int ()' implicit_instantiation
+// CHECK-NEXT: TemplateArgument type 'void'
+
+// CHECK:      ImplicitCastExpr {{.*}} 'float' <IntegralToFloating> RoundingMath=1 AllowFEnvAccess=1
+// CHECK-NEXT: IntegerLiteral {{.*}} 4294967295

From e54f48384bb213f2c204c74d4e7e08a13904a9d6 Mon Sep 17 00:00:00 2001
From: Serge Pavlov <sepavloff@gmail.com>
Date: Mon, 21 Aug 2023 13:20:22 +0700
Subject: [PATCH 11/92] [clang] Run test for concrete target

The test clang/test/SemaCXX/template-64605.cpp uses pragma FENV_ACCESS,
which is not supported on all targets. Restrict it to x86_64 only.

(cherry picked from commit 73e5a70e676850b79f196e01e2194a2485041584)
---
 clang/test/SemaCXX/template-64605.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/test/SemaCXX/template-64605.cpp b/clang/test/SemaCXX/template-64605.cpp
index b13acbf2ae566..99ccbfdc27f1c 100644
--- a/clang/test/SemaCXX/template-64605.cpp
+++ b/clang/test/SemaCXX/template-64605.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -ast-dump -ast-dump-filter=b_64605 %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -ast-dump -ast-dump-filter=b_64605 %s | FileCheck %s
 
 // https://github.com/llvm/llvm-project/issues/64605
 

From 5db0d770c7e8739a56dc8273ed2481041813153a Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph@gmail.com>
Date: Wed, 9 Aug 2023 12:47:13 -0700
Subject: [PATCH 12/92] Finish renaming getOperandSegmentSizeAttr() from
 `operand_segment_sizes` to `operandSegmentSizes`

This renaming started with the native ODS support for properties; this change completes it.

A mass automated textual rename seems safe for most codebases.
Also drop the ODS prefix to keep the accessors the same as they were before
this change:
 properties.odsOperandSegmentSizes
reverts back to:
 properties.operandSegmentSizes

The ODS prefix was creating divergence between all the places and made it harder to
be consistent.

Reviewed By: jpienaar

Differential Revision: https://reviews.llvm.org/D157173
---
 flang/lib/Optimizer/Dialect/FIROps.cpp        |  7 +--
 .../Fir/convert-to-llvm-openmp-and-fir.fir    | 22 +++----
 flang/test/Fir/convert-to-llvm.fir            |  8 +--
 mlir/docs/PatternRewriter.md                  |  2 +-
 mlir/include/mlir/Dialect/IRDL/IR/IRDLOps.td  | 24 ++++++++
 .../Dialect/Linalg/IR/LinalgInterfaces.td     | 17 ------
 mlir/include/mlir/IR/OpBase.td                |  4 +-
 mlir/include/mlir/IR/OpDefinition.h           |  8 +--
 mlir/lib/Bindings/Python/IRCore.cpp           | 16 ++---
 .../AMDGPU/Transforms/EmulateAtomics.cpp      |  4 +-
 mlir/lib/Dialect/Async/IR/Async.cpp           |  6 +-
 mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp      | 12 ++--
 mlir/lib/Dialect/SCF/IR/SCF.cpp               |  8 +--
 mlir/lib/Rewrite/ByteCode.cpp                 |  4 +-
 mlir/test/Bytecode/operand_segment_sizes.mlir |  4 +-
 .../OpenMPToLLVM/convert-to-llvmir.mlir       |  4 +-
 mlir/test/Dialect/GPU/invalid.mlir            |  8 +--
 mlir/test/Dialect/LLVMIR/invalid.mlir         |  2 +-
 mlir/test/Dialect/Linalg/named-ops.mlir       |  6 +-
 mlir/test/Dialect/OpenMP/invalid.mlir         | 36 +++++------
 mlir/test/Dialect/OpenMP/ops.mlir             | 36 +++++------
 mlir/test/Dialect/PDL/invalid.mlir            | 10 ++--
 mlir/test/Dialect/PDLInterp/invalid.mlir      |  2 +-
 mlir/test/Dialect/SCF/invalid.mlir            |  6 +-
 .../Dialect/SPIRV/IR/control-flow-ops.mlir    |  4 +-
 mlir/test/Dialect/Transform/ops-invalid.mlir  |  2 +-
 mlir/test/IR/parser.mlir                      |  2 +-
 mlir/test/IR/traits.mlir                      | 48 +++++++--------
 mlir/test/Rewrite/pdl-bytecode.mlir           |  4 +-
 mlir/test/Target/LLVMIR/omptarget-llvm.mlir   |  6 +-
 .../LLVMIR/omptarget-region-device-llvm.mlir  |  6 +-
 .../Target/LLVMIR/omptarget-region-llvm.mlir  |  6 +-
 .../omptarget-region-parallel-llvm.mlir       |  6 +-
 mlir/test/Target/LLVMIR/openmp-llvm.mlir      | 12 ++--
 .../Transforms/canonicalize-block-merge.mlir  |  2 +-
 mlir/test/Transforms/sccp.mlir                |  2 +-
 mlir/test/mlir-tblgen/op-decl-and-defs.td     |  2 +-
 mlir/test/mlir-tblgen/op-python-bindings.td   | 12 ++--
 mlir/test/python/dialects/linalg/ops.py       |  2 +-
 mlir/test/python/dialects/ods_helpers.py      | 12 ++--
 mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp   | 59 ++++++++++---------
 mlir/tools/mlir-tblgen/OpFormatGen.cpp        | 12 ++--
 mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp |  2 +-
 mlir/unittests/IR/AdaptorTest.cpp             |  2 +-
 44 files changed, 233 insertions(+), 226 deletions(-)

diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp
index 61ba0f584ae6d..960fc62190f5f 100644
--- a/flang/lib/Optimizer/Dialect/FIROps.cpp
+++ b/flang/lib/Optimizer/Dialect/FIROps.cpp
@@ -128,9 +128,8 @@ static mlir::ParseResult parseAllocatableOp(FN wrapResultType,
     parser.emitError(parser.getNameLoc(), "invalid allocate type: ") << intype;
     return mlir::failure();
   }
-  result.addAttribute(
-      "operand_segment_sizes",
-      builder.getDenseI32ArrayAttr({typeparamsSize, shapeSize}));
+  result.addAttribute("operandSegmentSizes", builder.getDenseI32ArrayAttr(
+                                                 {typeparamsSize, shapeSize}));
   if (parser.parseOptionalAttrDict(result.attributes) ||
       parser.addTypeToList(restype, result.types))
     return mlir::failure();
@@ -149,7 +148,7 @@ static void printAllocatableOp(mlir::OpAsmPrinter &p, OP &op) {
     p << ", ";
     p.printOperand(sh);
   }
-  p.printOptionalAttrDict(op->getAttrs(), {"in_type", "operand_segment_sizes"});
+  p.printOptionalAttrDict(op->getAttrs(), {"in_type", "operandSegmentSizes"});
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
index bf03c24fee75c..d67198d97699e 100644
--- a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
+++ b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
@@ -28,7 +28,7 @@ func.func @_QPsb1(%arg0: !fir.ref<i32> {fir.bindc_name = "n"}, %arg1: !fir.ref<!
 // CHECK:    %[[ONE_2:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK: omp.parallel   {
 // CHECK:      %[[ONE_3:.*]] = llvm.mlir.constant(1 : i64) : i64
-// CHECK:      %[[I_VAR:.*]] = llvm.alloca %[[ONE_3]] x i32 {adapt.valuebyref, in_type = i32, operand_segment_sizes = array<i32: 0, 0>, pinned} : (i64) -> !llvm.ptr<i32>
+// CHECK:      %[[I_VAR:.*]] = llvm.alloca %[[ONE_3]] x i32 {adapt.valuebyref, in_type = i32, operandSegmentSizes = array<i32: 0, 0>, pinned} : (i64) -> !llvm.ptr<i32>
 // CHECK:      %[[N:.*]] = llvm.load %[[N_REF]] : !llvm.ptr<i32>
 // CHECK: omp.wsloop nowait
 // CHECK-SAME: for (%[[I:.*]]) : i32 = (%[[ONE_2]]) to (%[[N]]) inclusive step (%[[ONE_2]]) {
@@ -200,7 +200,7 @@ func.func @_QPsimd1(%arg0: !fir.ref<i32> {fir.bindc_name = "n"}, %arg1: !fir.ref
 // CHECK:    %[[ONE_2:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK: omp.parallel   {
 // CHECK:      %[[ONE_3:.*]] = llvm.mlir.constant(1 : i64) : i64
-// CHECK:      %[[I_VAR:.*]] = llvm.alloca %[[ONE_3]] x i32 {adapt.valuebyref, in_type = i32, operand_segment_sizes = array<i32: 0, 0>, pinned} : (i64) -> !llvm.ptr<i32>
+// CHECK:      %[[I_VAR:.*]] = llvm.alloca %[[ONE_3]] x i32 {adapt.valuebyref, in_type = i32, operandSegmentSizes = array<i32: 0, 0>, pinned} : (i64) -> !llvm.ptr<i32>
 // CHECK:      %[[N:.*]] = llvm.load %[[N_REF]] : !llvm.ptr<i32>
 // CHECK: omp.simdloop
 // CHECK-SAME: (%[[I:.*]]) : i32 = (%[[ONE_2]]) to (%[[N]]) step (%[[ONE_2]]) {
@@ -231,13 +231,13 @@ func.func @_QPomp_target_data() {
 
 // CHECK-LABEL:   llvm.func @_QPomp_target_data() {
 // CHECK:           %[[VAL_0:.*]] = llvm.mlir.constant(1 : i64) : i64
-// CHECK:           %[[VAL_1:.*]] = llvm.alloca %[[VAL_0]] x !llvm.array<1024 x i32> {bindc_name = "a", in_type = !fir.array<1024xi32>, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_dataEa"} : (i64) -> !llvm.ptr<array<1024 x i32>>
+// CHECK:           %[[VAL_1:.*]] = llvm.alloca %[[VAL_0]] x !llvm.array<1024 x i32> {bindc_name = "a", in_type = !fir.array<1024xi32>, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_dataEa"} : (i64) -> !llvm.ptr<array<1024 x i32>>
 // CHECK:           %[[VAL_2:.*]] = llvm.mlir.constant(1 : i64) : i64
-// CHECK:           %[[VAL_3:.*]] = llvm.alloca %[[VAL_2]] x !llvm.array<1024 x i32> {bindc_name = "b", in_type = !fir.array<1024xi32>, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_dataEb"} : (i64) -> !llvm.ptr<array<1024 x i32>>
+// CHECK:           %[[VAL_3:.*]] = llvm.alloca %[[VAL_2]] x !llvm.array<1024 x i32> {bindc_name = "b", in_type = !fir.array<1024xi32>, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_dataEb"} : (i64) -> !llvm.ptr<array<1024 x i32>>
 // CHECK:           %[[VAL_4:.*]] = llvm.mlir.constant(1 : i64) : i64
-// CHECK:           %[[VAL_5:.*]] = llvm.alloca %[[VAL_4]] x !llvm.array<1024 x i32> {bindc_name = "c", in_type = !fir.array<1024xi32>, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_dataEc"} : (i64) -> !llvm.ptr<array<1024 x i32>>
+// CHECK:           %[[VAL_5:.*]] = llvm.alloca %[[VAL_4]] x !llvm.array<1024 x i32> {bindc_name = "c", in_type = !fir.array<1024xi32>, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_dataEc"} : (i64) -> !llvm.ptr<array<1024 x i32>>
 // CHECK:           %[[VAL_6:.*]] = llvm.mlir.constant(1 : i64) : i64
-// CHECK:           %[[VAL_7:.*]] = llvm.alloca %[[VAL_6]] x !llvm.array<1024 x i32> {bindc_name = "d", in_type = !fir.array<1024xi32>, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_dataEd"} : (i64) -> !llvm.ptr<array<1024 x i32>>
+// CHECK:           %[[VAL_7:.*]] = llvm.alloca %[[VAL_6]] x !llvm.array<1024 x i32> {bindc_name = "d", in_type = !fir.array<1024xi32>, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_dataEd"} : (i64) -> !llvm.ptr<array<1024 x i32>>
 // CHECK:           omp.target_enter_data   map((to -> %[[VAL_1]] : !llvm.ptr<array<1024 x i32>>), (to -> %[[VAL_3]] : !llvm.ptr<array<1024 x i32>>), (always, alloc -> %[[VAL_5]] : !llvm.ptr<array<1024 x i32>>))
 // CHECK:           omp.target_exit_data   map((from -> %[[VAL_1]] : !llvm.ptr<array<1024 x i32>>), (from -> %[[VAL_3]] : !llvm.ptr<array<1024 x i32>>), (release -> %[[VAL_5]] : !llvm.ptr<array<1024 x i32>>), (always, delete -> %[[VAL_7]] : !llvm.ptr<array<1024 x i32>>))
 // CHECK:           llvm.return
@@ -278,9 +278,9 @@ func.func @_QPopenmp_target_data_region() {
 
 // CHECK-LABEL:   llvm.func @_QPopenmp_target_data_region() {
 // CHECK:           %[[VAL_0:.*]] = llvm.mlir.constant(1 : i64) : i64
-// CHECK:           %[[VAL_1:.*]] = llvm.alloca %[[VAL_0]] x !llvm.array<1024 x i32> {bindc_name = "a", in_type = !fir.array<1024xi32>, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFopenmp_target_data_regionEa"} : (i64) -> !llvm.ptr<array<1024 x i32>>
+// CHECK:           %[[VAL_1:.*]] = llvm.alloca %[[VAL_0]] x !llvm.array<1024 x i32> {bindc_name = "a", in_type = !fir.array<1024xi32>, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFopenmp_target_data_regionEa"} : (i64) -> !llvm.ptr<array<1024 x i32>>
 // CHECK:           %[[VAL_2:.*]] = llvm.mlir.constant(1 : i64) : i64
-// CHECK:           %[[VAL_3:.*]] = llvm.alloca %[[VAL_2]] x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFopenmp_target_data_regionEi"} : (i64) -> !llvm.ptr<i32>
+// CHECK:           %[[VAL_3:.*]] = llvm.alloca %[[VAL_2]] x i32 {bindc_name = "i", in_type = i32, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFopenmp_target_data_regionEi"} : (i64) -> !llvm.ptr<i32>
 // CHECK:           omp.target_data   map((tofrom -> %[[VAL_1]] : !llvm.ptr<array<1024 x i32>>)) {
 // CHECK:             %[[VAL_4:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK:             %[[VAL_5:.*]] = llvm.sext %[[VAL_4]] : i32 to i64
@@ -338,7 +338,7 @@ func.func @_QPomp_target() {
 
 // CHECK-LABEL:   llvm.func @_QPomp_target() {
 // CHECK:           %[[VAL_0:.*]] = llvm.mlir.constant(1 : i64) : i64
-// CHECK:           %[[VAL_1:.*]] = llvm.alloca %[[VAL_0]] x !llvm.array<512 x i32> {bindc_name = "a", in_type = !fir.array<512xi32>, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFomp_targetEa"} : (i64) -> !llvm.ptr<array<512 x i32>>
+// CHECK:           %[[VAL_1:.*]] = llvm.alloca %[[VAL_0]] x !llvm.array<512 x i32> {bindc_name = "a", in_type = !fir.array<512xi32>, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFomp_targetEa"} : (i64) -> !llvm.ptr<array<512 x i32>>
 // CHECK:           %[[VAL_2:.*]] = llvm.mlir.constant(64 : i32) : i32
 // CHECK:           omp.target   thread_limit(%[[VAL_2]] : i32) map((tofrom -> %[[VAL_1]] : !llvm.ptr<array<512 x i32>>)) {
 // CHECK:             %[[VAL_3:.*]] = llvm.mlir.constant(10 : i32) : i32
@@ -544,7 +544,7 @@ func.func @_QPsb() {
 // CHECK:  llvm.func @_QPsb() {
 // CHECK:    %[[ONE:.*]] = llvm.mlir.constant(1 : i32) : i32
 // CHECK:    %[[SIZE:.*]] = llvm.mlir.constant(1 : i64) : i64
-// CHECK:    %[[LI_REF:.*]] = llvm.alloca %6 x i32 {bindc_name = "li", in_type = i32, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFsbEli"} : (i64) -> !llvm.ptr<i32>
+// CHECK:    %[[LI_REF:.*]] = llvm.alloca %6 x i32 {bindc_name = "li", in_type = i32, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFsbEli"} : (i64) -> !llvm.ptr<i32>
 // CHECK:    omp.sections   {
 // CHECK:      omp.section {
 // CHECK:        llvm.br ^[[BB_ENTRY:.*]]({{.*}})
@@ -582,7 +582,7 @@ func.func @_QPsb() {
 // CHECK:  }
 // CHECK-LABEL:  @_QPsimple_reduction
 // CHECK-SAME: %[[ARRAY_REF:.*]]: !llvm.ptr<array<100 x i32>>
-// CHECK:    %[[RED_ACCUMULATOR:.*]] = llvm.alloca %2 x i32 {bindc_name = "x", in_type = !fir.logical<4>, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFsimple_reductionEx"} : (i64) -> !llvm.ptr<i32>
+// CHECK:    %[[RED_ACCUMULATOR:.*]] = llvm.alloca %2 x i32 {bindc_name = "x", in_type = !fir.logical<4>, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFsimple_reductionEx"} : (i64) -> !llvm.ptr<i32>
 // CHECK:    omp.parallel   {
 // CHECK:      omp.wsloop   reduction(@[[EQV_REDUCTION]] -> %[[RED_ACCUMULATOR]] : !llvm.ptr<i32>) for
 // CHECK:        %[[ARRAY_ELEM_REF:.*]] = llvm.getelementptr %[[ARRAY_REF]][0, %{{.*}}] : (!llvm.ptr<array<100 x i32>>, i64) -> !llvm.ptr<i32>
diff --git a/flang/test/Fir/convert-to-llvm.fir b/flang/test/Fir/convert-to-llvm.fir
index d0c154fb0376e..52716afe3198d 100644
--- a/flang/test/Fir/convert-to-llvm.fir
+++ b/flang/test/Fir/convert-to-llvm.fir
@@ -1748,7 +1748,7 @@ func.func @no_reassoc(%arg0: !fir.ref<i32>) {
 // CHECK-LABEL: llvm.func @no_reassoc(
 // CHECK-SAME:                        %[[ARG0:.*]]: !llvm.ptr<i32>) {
 // CHECK:         %[[C1:.*]] = llvm.mlir.constant(1 : i64) : i64
-// CHECK:         %[[ALLOC:.*]] = llvm.alloca %[[C1]] x i32 {in_type = i32, operand_segment_sizes = array<i32: 0, 0>} : (i64) -> !llvm.ptr<i32>
+// CHECK:         %[[ALLOC:.*]] = llvm.alloca %[[C1]] x i32 {in_type = i32, operandSegmentSizes = array<i32: 0, 0>} : (i64) -> !llvm.ptr<i32>
 // CHECK:         %[[LOAD:.*]] = llvm.load %[[ARG0]] : !llvm.ptr<i32>
 // CHECK:         llvm.store %[[LOAD]], %[[ALLOC]] : !llvm.ptr<i32>
 // CHECK:         llvm.return
@@ -1868,7 +1868,7 @@ func.func private @_QPxb(!fir.box<!fir.array<?x?xf64>>)
 // CHECK:         %[[C1_0:.*]] = llvm.mlir.constant(1 : i64) : i64
 // CHECK:         %[[ARR_SIZE_TMP1:.*]] = llvm.mul %[[C1_0]], %[[N1]]  : i64
 // CHECK:         %[[ARR_SIZE:.*]] = llvm.mul %[[ARR_SIZE_TMP1]], %[[N2]]  : i64
-// CHECK:         %[[ARR:.*]] = llvm.alloca %[[ARR_SIZE]] x f64 {bindc_name = "arr", in_type = !fir.array<?x?xf64>, operand_segment_sizes = array<i32: 0, 2>, uniq_name = "_QFsbEarr"} : (i64) -> !llvm.ptr<f64>
+// CHECK:         %[[ARR:.*]] = llvm.alloca %[[ARR_SIZE]] x f64 {bindc_name = "arr", in_type = !fir.array<?x?xf64>, operandSegmentSizes = array<i32: 0, 2>, uniq_name = "_QFsbEarr"} : (i64) -> !llvm.ptr<f64>
 // CHECK:         %[[TYPE_CODE:.*]] = llvm.mlir.constant(28 : i32) : i32
 // CHECK:         %[[NULL:.*]] = llvm.mlir.null : !llvm.ptr<f64>
 // CHECK:         %[[GEP:.*]] = llvm.getelementptr %[[NULL]][1]
@@ -1945,9 +1945,9 @@ func.func private @_QPtest_dt_callee(%arg0: !fir.box<!fir.array<?xi32>>)
 // CHECK:         %[[C10:.*]] = llvm.mlir.constant(10 : i64) : i64
 // CHECK:         %[[C2:.*]] = llvm.mlir.constant(2 : i64) : i64
 // CHECK:         %[[ALLOCA_SIZE_V:.*]] = llvm.mlir.constant(1 : i64) : i64
-// CHECK:         %[[V:.*]] = llvm.alloca %[[ALLOCA_SIZE_V]] x i32 {bindc_name = "v", in_type = i32, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFtest_dt_sliceEv"} : (i64) -> !llvm.ptr<i32>
+// CHECK:         %[[V:.*]] = llvm.alloca %[[ALLOCA_SIZE_V]] x i32 {bindc_name = "v", in_type = i32, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFtest_dt_sliceEv"} : (i64) -> !llvm.ptr<i32>
 // CHECK:         %[[ALLOCA_SIZE_X:.*]] = llvm.mlir.constant(1 : i64) : i64
-// CHECK:         %[[X:.*]] = llvm.alloca %[[ALLOCA_SIZE_X]] x !llvm.array<20 x struct<"_QFtest_dt_sliceTt", (i32, i32)>> {bindc_name = "x", in_type = !fir.array<20x!fir.type<_QFtest_dt_sliceTt{i:i32,j:i32}>>, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFtest_dt_sliceEx"} : (i64) -> !llvm.ptr<array<20 x struct<"_QFtest_dt_sliceTt", (i32, i32)>>>
+// CHECK:         %[[X:.*]] = llvm.alloca %[[ALLOCA_SIZE_X]] x !llvm.array<20 x struct<"_QFtest_dt_sliceTt", (i32, i32)>> {bindc_name = "x", in_type = !fir.array<20x!fir.type<_QFtest_dt_sliceTt{i:i32,j:i32}>>, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFtest_dt_sliceEx"} : (i64) -> !llvm.ptr<array<20 x struct<"_QFtest_dt_sliceTt", (i32, i32)>>>
 // CHECK:         %[[TYPE_CODE:.*]] = llvm.mlir.constant(9 : i32) : i32
 // CHECK:         %[[NULL:.*]] = llvm.mlir.null : !llvm.ptr<i32>
 // CHECK:         %[[GEP:.*]] = llvm.getelementptr %[[NULL]][1]
diff --git a/mlir/docs/PatternRewriter.md b/mlir/docs/PatternRewriter.md
index 8428d4ba991ef..8fe5ef35a7603 100644
--- a/mlir/docs/PatternRewriter.md
+++ b/mlir/docs/PatternRewriter.md
@@ -383,7 +383,7 @@ Example output is shown below:
 ```
 //===-------------------------------------------===//
 Processing operation : 'cf.cond_br'(0x60f000001120) {
-  "cf.cond_br"(%arg0)[^bb2, ^bb2] {operand_segment_sizes = array<i32: 1, 0, 0>} : (i1) -> ()
+  "cf.cond_br"(%arg0)[^bb2, ^bb2] {operandSegmentSizes = array<i32: 1, 0, 0>} : (i1) -> ()
 
   * Pattern SimplifyConstCondBranchPred : 'cf.cond_br -> ()' {
   } -> failure : pattern failed to match
diff --git a/mlir/include/mlir/Dialect/IRDL/IR/IRDLOps.td b/mlir/include/mlir/Dialect/IRDL/IR/IRDLOps.td
index dfa97c865118f..9f15ca767abf9 100644
--- a/mlir/include/mlir/Dialect/IRDL/IR/IRDLOps.td
+++ b/mlir/include/mlir/Dialect/IRDL/IR/IRDLOps.td
@@ -223,6 +223,18 @@ def IRDL_OperandsOp : IRDL_Op<"operands", [HasParent<"OperationOp">]> {
 
     The `mul` operation will expect two operands of type `cmath.complex`, that
     have the same type, and return a result of the same type.
+
+    The operands can also be marked as variadic or optional:
+    ```mlir
+    irdl.operands(%0, single %1, optional %2, variadic %3)
+    ```
+    
+    Here, %0 and %1 are required single operands, %2 is an optional operand,
+    and %3 is a variadic operand.
+
+    When more than one operand is marked as optional or variadic, the operation
+    will expect a 'operandSegmentSizes' attribute that defines the number of
+    operands in each segment.
   }];
 
   let arguments = (ins Variadic<IRDL_AttributeType>:$args);
@@ -254,6 +266,18 @@ def IRDL_ResultsOp : IRDL_Op<"results", [HasParent<"OperationOp">]> {
 
     The operation will expect one operand of the `cmath.complex` type, and two
     results that have the underlying type of the `cmath.complex`.
+
+    The results can also be marked as variadic or optional:
+    ```mlir
+    irdl.results(%0, single %1, optional %2, variadic %3)
+    ```
+    
+    Here, %0 and %1 are required single results, %2 is an optional result,
+    and %3 is a variadic result.
+
+    When more than one result is marked as optional or variadic, the operation
+    will expect a 'resultSegmentSizes' attribute that defines the number of
+    results in each segment.
   }];
 
   let arguments = (ins Variadic<IRDL_AttributeType>:$args);
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td
index 1efd2b6b63dd9..4567b3f1902d7 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td
@@ -874,23 +874,6 @@ def LinalgStructuredInterface : OpInterface<"LinalgOp"> {
       return cast<DestinationStyleOpInterface>(*this->getOperation())
           .hasTensorSemantics();
     }
-
-    //========================================================================//
-    // Helper functions to mutate the `operand_segment_sizes` attribute.
-    // These are useful when cloning and changing operand types.
-    //========================================================================//
-    void setNumInputs(unsigned num) { setOperandSegmentAt(0, num); }
-    void setNumOutputBuffers(unsigned num) { setOperandSegmentAt(1, num); }
-
-    private:
-    void setOperandSegmentAt(unsigned idx, unsigned val) {
-      auto attr = ::llvm::cast<DenseIntElementsAttr>(
-                      (*this)->getAttr("operand_segment_sizes"));
-      unsigned i = 0;
-      auto newAttr = attr.mapValues(IntegerType::get(getContext(), 32),
-        [&](const APInt &v) { return (i++ == idx) ? APInt(32, val) : v; });
-      getOperation()->setAttr("operand_segment_sizes", newAttr);
-    }
   }];
 
   let verify = [{ return detail::verifyStructuredOpInterface($_op); }];
diff --git a/mlir/include/mlir/IR/OpBase.td b/mlir/include/mlir/IR/OpBase.td
index 274a531f4061e..f25106b1593a3 100644
--- a/mlir/include/mlir/IR/OpBase.td
+++ b/mlir/include/mlir/IR/OpBase.td
@@ -2178,7 +2178,7 @@ def SameVariadicOperandSize : GenInternalOpTrait<"SameVariadicOperandSize">;
 // to have the same array size.
 def SameVariadicResultSize : GenInternalOpTrait<"SameVariadicResultSize">;
 
-// Uses an attribute named `operand_segment_sizes` to specify how many actual
+// Uses an attribute named `operandSegmentSizes` to specify how many actual
 // operand each ODS-declared operand (variadic or not) corresponds to.
 // This trait is used for ops that have multiple variadic operands but do
 // not know statically their size relationship. The attribute must be a 1D
@@ -2188,7 +2188,7 @@ def SameVariadicResultSize : GenInternalOpTrait<"SameVariadicResultSize">;
 def AttrSizedOperandSegments :
   NativeOpTrait<"AttrSizedOperandSegments">, StructuralOpTrait;
 // Similar to AttrSizedOperandSegments, but used for results. The attribute
-// should be named as `result_segment_sizes`.
+// should be named as `resultSegmentSizes`.
 def AttrSizedResultSegments  :
   NativeOpTrait<"AttrSizedResultSegments">, StructuralOpTrait;
 
diff --git a/mlir/include/mlir/IR/OpDefinition.h b/mlir/include/mlir/IR/OpDefinition.h
index d42bffaf32b03..afbd0395b466a 100644
--- a/mlir/include/mlir/IR/OpDefinition.h
+++ b/mlir/include/mlir/IR/OpDefinition.h
@@ -1331,7 +1331,7 @@ struct HasParent {
 /// relationship is not always known statically. For such cases, we need
 /// a per-op-instance specification to divide the operands into logical groups
 /// or segments. This can be modeled by attributes. The attribute will be named
-/// as `operand_segment_sizes`.
+/// as `operandSegmentSizes`.
 ///
 /// This trait verifies the attribute for specifying operand segments has
 /// the correct type (1D vector) and values (non-negative), etc.
@@ -1339,9 +1339,7 @@ template <typename ConcreteType>
 class AttrSizedOperandSegments
     : public TraitBase<ConcreteType, AttrSizedOperandSegments> {
 public:
-  static StringRef getOperandSegmentSizeAttr() {
-    return "operand_segment_sizes";
-  }
+  static StringRef getOperandSegmentSizeAttr() { return "operandSegmentSizes"; }
 
   static LogicalResult verifyTrait(Operation *op) {
     return ::mlir::OpTrait::impl::verifyOperandSizeAttr(
@@ -1354,7 +1352,7 @@ template <typename ConcreteType>
 class AttrSizedResultSegments
     : public TraitBase<ConcreteType, AttrSizedResultSegments> {
 public:
-  static StringRef getResultSegmentSizeAttr() { return "result_segment_sizes"; }
+  static StringRef getResultSegmentSizeAttr() { return "resultSegmentSizes"; }
 
   static LogicalResult verifyTrait(Operation *op) {
     return ::mlir::OpTrait::impl::verifyResultSizeAttr(
diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp
index 971d2819ade44..c755dc12a311b 100644
--- a/mlir/lib/Bindings/Python/IRCore.cpp
+++ b/mlir/lib/Bindings/Python/IRCore.cpp
@@ -1675,28 +1675,28 @@ py::object PyOpView::buildGeneric(
     } else {
       attributes = py::dict();
     }
-    if (attributes->contains("result_segment_sizes") ||
-        attributes->contains("operand_segment_sizes")) {
-      throw py::value_error("Manually setting a 'result_segment_sizes' or "
-                            "'operand_segment_sizes' attribute is unsupported. "
+    if (attributes->contains("resultSegmentSizes") ||
+        attributes->contains("operandSegmentSizes")) {
+      throw py::value_error("Manually setting a 'resultSegmentSizes' or "
+                            "'operandSegmentSizes' attribute is unsupported. "
                             "Use Operation.create for such low-level access.");
     }
 
-    // Add result_segment_sizes attribute.
+    // Add resultSegmentSizes attribute.
     if (!resultSegmentLengths.empty()) {
       MlirAttribute segmentLengthAttr =
           mlirDenseI32ArrayGet(context->get(), resultSegmentLengths.size(),
                                resultSegmentLengths.data());
-      (*attributes)["result_segment_sizes"] =
+      (*attributes)["resultSegmentSizes"] =
           PyAttribute(context, segmentLengthAttr);
     }
 
-    // Add operand_segment_sizes attribute.
+    // Add operandSegmentSizes attribute.
     if (!operandSegmentLengths.empty()) {
       MlirAttribute segmentLengthAttr =
           mlirDenseI32ArrayGet(context->get(), operandSegmentLengths.size(),
                                operandSegmentLengths.data());
-      (*attributes)["operand_segment_sizes"] =
+      (*attributes)["operandSegmentSizes"] =
           PyAttribute(context, segmentLengthAttr);
     }
   }
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp
index 9dfe07797ff4b..e6154a329aacc 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp
@@ -53,7 +53,7 @@ enum class DataArgAction : unsigned char {
 
 // Fix up the fact that, when we're migrating from a general bugffer atomic
 // to a load or to a CAS, the number of openrands, and thus the number of
-// entries needed in operand_segment_sizes, needs to change. We use this method
+// entries needed in operandSegmentSizes, needs to change. We use this method
 // because we'd like to preserve unknown attributes on the atomic instead of
 // discarding them.
 static void patchOperandSegmentSizes(ArrayRef<NamedAttribute> attrs,
@@ -61,7 +61,7 @@ static void patchOperandSegmentSizes(ArrayRef<NamedAttribute> attrs,
                                      DataArgAction action) {
   newAttrs.reserve(attrs.size());
   for (NamedAttribute attr : attrs) {
-    if (attr.getName().getValue() != "operand_segment_sizes") {
+    if (attr.getName().getValue() != "operandSegmentSizes") {
       newAttrs.push_back(attr);
       continue;
     }
diff --git a/mlir/lib/Dialect/Async/IR/Async.cpp b/mlir/lib/Dialect/Async/IR/Async.cpp
index 7d018bf8f3a3d..abe6670c7f855 100644
--- a/mlir/lib/Dialect/Async/IR/Async.cpp
+++ b/mlir/lib/Dialect/Async/IR/Async.cpp
@@ -61,7 +61,7 @@ YieldOp::getMutableSuccessorOperands(std::optional<unsigned> index) {
 /// ExecuteOp
 //===----------------------------------------------------------------------===//
 
-constexpr char kOperandSegmentSizesAttr[] = "operand_segment_sizes";
+constexpr char kOperandSegmentSizesAttr[] = "operandSegmentSizes";
 
 OperandRange
 ExecuteOp::getSuccessorEntryOperands(std::optional<unsigned> index) {
@@ -100,7 +100,7 @@ void ExecuteOp::build(OpBuilder &builder, OperationState &result,
   result.addOperands(dependencies);
   result.addOperands(operands);
 
-  // Add derived `operand_segment_sizes` attribute based on parsed operands.
+  // Add derived `operandSegmentSizes` attribute based on parsed operands.
   int32_t numDependencies = dependencies.size();
   int32_t numOperands = operands.size();
   auto operandSegmentSizes =
@@ -208,7 +208,7 @@ ParseResult ExecuteOp::parse(OpAsmParser &parser, OperationState &result) {
 
   int32_t numOperands = valueArgs.size();
 
-  // Add derived `operand_segment_sizes` attribute based on parsed operands.
+  // Add derived `operandSegmentSizes` attribute based on parsed operands.
   auto operandSegmentSizes =
       parser.getBuilder().getDenseI32ArrayAttr({numDependencies, numOperands});
   result.addAttribute(kOperandSegmentSizesAttr, operandSegmentSizes);
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index d6778ed72c7d0..4f5452b27e3e0 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -170,7 +170,7 @@ static void buildStructuredOp(OpBuilder &b, OperationState &state,
   state.addTypes(derivedResultTypes);
   state.addAttributes(attributes);
   state.addAttribute(
-      "operand_segment_sizes",
+      "operandSegmentSizes",
       b.getDenseI32ArrayAttr({static_cast<int32_t>(inputs.size()),
                               static_cast<int32_t>(outputs.size())}));
 
@@ -226,18 +226,18 @@ parseCommonStructuredOpParts(OpAsmParser &parser, OperationState &result,
     // This is a bit complex because we're trying to be backward compatible with
     // operation syntax that mix the inherent attributes and the discardable
     // ones in the same dictionary. If the properties are used, we append the
-    // operand_segment_sizes there directly. Otherwise we append it to the
+    // operandSegmentSizes there directly. Otherwise we append it to the
     // discardable attributes dictionary where it is handled by the generic
     // Operation::create(...) method.
     if (result.propertiesAttr) {
       NamedAttrList attrs = llvm::cast<DictionaryAttr>(result.propertiesAttr);
-      attrs.append("operand_segment_sizes",
+      attrs.append("operandSegmentSizes",
                    parser.getBuilder().getDenseI32ArrayAttr(
                        {static_cast<int32_t>(inputsOperands.size()),
                         static_cast<int32_t>(outputsOperands.size())}));
       result.propertiesAttr = attrs.getDictionary(parser.getContext());
     } else {
-      result.addAttribute("operand_segment_sizes",
+      result.addAttribute("operandSegmentSizes",
                           parser.getBuilder().getDenseI32ArrayAttr(
                               {static_cast<int32_t>(inputsOperands.size()),
                                static_cast<int32_t>(outputsOperands.size())}));
@@ -332,7 +332,7 @@ static void printNamedStructuredOp(OpAsmPrinter &p, Operation *op,
                                    ValueRange inputs, ValueRange outputs) {
   p.printOptionalAttrDict(
       op->getAttrs(),
-      /*elidedAttrs=*/{"operand_segment_sizes",
+      /*elidedAttrs=*/{"operandSegmentSizes",
                        // See generated code in
                        // LinalgNamedStructuredOps.yamlgen.cpp.inc
                        "linalg.memoized_indexing_maps"});
@@ -878,7 +878,7 @@ void GenericOp::print(OpAsmPrinter &p) {
   printCommonStructuredOpParts(p, SmallVector<Value>(getDpsInputOperands()),
                                SmallVector<Value>(getDpsInitOperands()));
 
-  genericAttrNames.push_back("operand_segment_sizes");
+  genericAttrNames.push_back("operandSegmentSizes");
   genericAttrNamesSet.insert(genericAttrNames.back());
 
   bool hasExtraAttrs = false;
diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp
index aaa5e39cd2f3d..a7b516e1e8640 100644
--- a/mlir/lib/Dialect/SCF/IR/SCF.cpp
+++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp
@@ -1365,7 +1365,7 @@ ParseResult ForallOp::parse(OpAsmParser &parser, OperationState &result) {
   result.addAttribute("staticLowerBound", staticLbs);
   result.addAttribute("staticUpperBound", staticUbs);
   result.addAttribute("staticStep", staticSteps);
-  result.addAttribute("operand_segment_sizes",
+  result.addAttribute("operandSegmentSizes",
                       parser.getBuilder().getDenseI32ArrayAttr(
                           {static_cast<int32_t>(dynamicLbs.size()),
                            static_cast<int32_t>(dynamicUbs.size()),
@@ -1400,7 +1400,7 @@ void ForallOp::build(
   result.addAttribute(getStaticStepAttrName(result.name),
                       b.getDenseI64ArrayAttr(staticSteps));
   result.addAttribute(
-      "operand_segment_sizes",
+      "operandSegmentSizes",
       b.getDenseI32ArrayAttr({static_cast<int32_t>(dynamicLbs.size()),
                               static_cast<int32_t>(dynamicUbs.size()),
                               static_cast<int32_t>(dynamicSteps.size()),
@@ -1601,7 +1601,7 @@ struct ForallOpSingleOrZeroIterationDimsFolder
                                       op.getOutputs(), std::nullopt, nullptr);
     newOp.getBodyRegion().getBlocks().clear();
     // The new loop needs to keep all attributes from the old one, except for
-    // "operand_segment_sizes" and static loop bound attributes which capture
+    // "operandSegmentSizes" and static loop bound attributes which capture
     // the outdated information of the old iteration domain.
     SmallVector<StringAttr> elidedAttrs{newOp.getOperandSegmentSizesAttrName(),
                                         newOp.getStaticLowerBoundAttrName(),
@@ -2833,7 +2833,7 @@ ParseResult ParallelOp::parse(OpAsmParser &parser, OperationState &result) {
   if (parser.parseRegion(*body, ivs))
     return failure();
 
-  // Set `operand_segment_sizes` attribute.
+  // Set `operandSegmentSizes` attribute.
   result.addAttribute(
       ParallelOp::getOperandSegmentSizeAttr(),
       builder.getDenseI32ArrayAttr({static_cast<int32_t>(lower.size()),
diff --git a/mlir/lib/Rewrite/ByteCode.cpp b/mlir/lib/Rewrite/ByteCode.cpp
index c8c442823781b..e7d4c4089a991 100644
--- a/mlir/lib/Rewrite/ByteCode.cpp
+++ b/mlir/lib/Rewrite/ByteCode.cpp
@@ -1846,7 +1846,7 @@ void ByteCodeExecutor::executeGetOperands() {
   ByteCodeField rangeIndex = read();
 
   void *result = executeGetOperandsResults<OpTrait::AttrSizedOperandSegments>(
-      op->getOperands(), op, index, rangeIndex, "operand_segment_sizes",
+      op->getOperands(), op, index, rangeIndex, "operandSegmentSizes",
       valueRangeMemory);
   if (!result)
     LLVM_DEBUG(llvm::dbgs() << "  * Invalid operand range\n");
@@ -1872,7 +1872,7 @@ void ByteCodeExecutor::executeGetResults() {
   ByteCodeField rangeIndex = read();
 
   void *result = executeGetOperandsResults<OpTrait::AttrSizedResultSegments>(
-      op->getResults(), op, index, rangeIndex, "result_segment_sizes",
+      op->getResults(), op, index, rangeIndex, "resultSegmentSizes",
       valueRangeMemory);
   if (!result)
     LLVM_DEBUG(llvm::dbgs() << "  * Invalid result range\n");
diff --git a/mlir/test/Bytecode/operand_segment_sizes.mlir b/mlir/test/Bytecode/operand_segment_sizes.mlir
index 9791bd4e0f264..c0379c2994f49 100644
--- a/mlir/test/Bytecode/operand_segment_sizes.mlir
+++ b/mlir/test/Bytecode/operand_segment_sizes.mlir
@@ -2,7 +2,7 @@
 
 
 func.func @roundtripOperandSizeAttr(%arg0: i32) {
-  // CHECK: odsOperandSegmentSizes = array<i32: 0, 2, 1, 1>}>
-  "test.attr_sized_operands"(%arg0, %arg0, %arg0, %arg0) <{odsOperandSegmentSizes = array<i32: 0, 2, 1, 1>}> : (i32, i32, i32, i32) -> ()
+  // CHECK: operandSegmentSizes = array<i32: 0, 2, 1, 1>}>
+  "test.attr_sized_operands"(%arg0, %arg0, %arg0, %arg0) <{operandSegmentSizes = array<i32: 0, 2, 1, 1>}> : (i32, i32, i32, i32) -> ()
   return
 }
diff --git a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir
index ab91729a0556b..b83b122f75e4b 100644
--- a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir
+++ b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir
@@ -79,7 +79,7 @@ func.func @wsloop(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4:
       // CHECK: "test.payload"(%[[CAST_ARG6]], %[[CAST_ARG7]]) : (index, index) -> ()
       "test.payload"(%arg6, %arg7) : (index, index) -> ()
       omp.yield
-    }) {operand_segment_sizes = array<i32: 2, 2, 2, 0, 0, 0, 0>} : (index, index, index, index, index, index) -> ()
+    }) {operandSegmentSizes = array<i32: 2, 2, 2, 0, 0, 0, 0>} : (index, index, index, index, index, index) -> ()
     omp.terminator
   }
   return
@@ -328,7 +328,7 @@ llvm.func @_QPsimple_reduction(%arg0: !llvm.ptr<array<100 x i32>> {fir.bindc_nam
   %5 = llvm.zext %2 : i1 to i32
   llvm.store %5, %4 : !llvm.ptr<i32>
   omp.parallel   {
-    %6 = llvm.alloca %3 x i32 {adapt.valuebyref, in_type = i32, operand_segment_sizes = array<i32: 0, 0>, pinned} : (i64) -> !llvm.ptr<i32>
+    %6 = llvm.alloca %3 x i32 {adapt.valuebyref, in_type = i32, operandSegmentSizes = array<i32: 0, 0>, pinned} : (i64) -> !llvm.ptr<i32>
     omp.wsloop   reduction(@eqv_reduction -> %4 : !llvm.ptr<i32>) for  (%arg1) : i32 = (%1) to (%0) inclusive step (%1) {
       llvm.store %arg1, %6 : !llvm.ptr<i32>
       %7 = llvm.load %6 : !llvm.ptr<i32>
diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir
index e280cd65811db..80c65e14e7635 100644
--- a/mlir/test/Dialect/GPU/invalid.mlir
+++ b/mlir/test/Dialect/GPU/invalid.mlir
@@ -4,7 +4,7 @@ func.func @not_enough_sizes(%sz : index) {
   // expected-error@+1 {{expected 6 or more operands, but found 5}}
   "gpu.launch"(%sz, %sz, %sz, %sz, %sz) ({
     gpu.return
-  }) {operand_segment_sizes = array<i32: 0, 1, 1, 1, 1, 1, 1, 0>} : (index, index, index, index, index) -> ()
+  }) {operandSegmentSizes = array<i32: 0, 1, 1, 1, 1, 1, 1, 0>} : (index, index, index, index, index) -> ()
   return
 }
 
@@ -16,7 +16,7 @@ func.func @no_region_attrs(%sz : index) {
   ^bb1(%bx: index, %by: index, %bz: index,
        %tx: index, %ty: index, %tz: index):
     gpu.terminator
-  }) {operand_segment_sizes = array<i32: 0, 1, 1, 1, 1, 1, 1, 0>} : (index, index, index, index, index, index) -> ()
+  }) {operandSegmentSizes = array<i32: 0, 1, 1, 1, 1, 1, 1, 0>} : (index, index, index, index, index, index) -> ()
   return
 }
 
@@ -38,7 +38,7 @@ func.func @launch_requires_gpu_return(%sz : index) {
 func.func @launch_func_too_few_operands(%sz : index) {
   // expected-error@+1 {{expected 6 or more operands}}
   "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz)
-      {operand_segment_sizes = array<i32: 0, 1, 1, 1, 1, 1, 0, 0>}
+      {operandSegmentSizes = array<i32: 0, 1, 1, 1, 1, 1, 0, 0>}
       : (index, index, index, index, index) -> ()
   return
 }
@@ -57,7 +57,7 @@ module attributes {gpu.container_module} {
   func.func @launch_func_missing_callee_attribute(%sz : index) {
     // expected-error@+1 {{'gpu.launch_func' op requires attribute 'kernel'}}
     "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz)
-        {operand_segment_sizes = array<i32: 0, 1, 1, 1, 1, 1, 1, 0, 0>}
+        {operandSegmentSizes = array<i32: 0, 1, 1, 1, 1, 1, 1, 0, 0>}
         : (index, index, index, index, index, index) -> ()
     return
   }
diff --git a/mlir/test/Dialect/LLVMIR/invalid.mlir b/mlir/test/Dialect/LLVMIR/invalid.mlir
index 14141c4c243ab..cf4697b17aa46 100644
--- a/mlir/test/Dialect/LLVMIR/invalid.mlir
+++ b/mlir/test/Dialect/LLVMIR/invalid.mlir
@@ -887,7 +887,7 @@ func.func @switch_wrong_number_of_weights(%arg0 : i32) {
 
 func.func @switch_case_type_mismatch(%arg0 : i64) {
   // expected-error@below {{expects case value type to match condition value type}}
-  "llvm.switch"(%arg0)[^bb1, ^bb2] <{case_operand_segments = array<i32: 0>, case_values = dense<42> : vector<1xi32>, odsOperandSegmentSizes = array<i32: 1, 0, 0>}> : (i64) -> ()
+  "llvm.switch"(%arg0)[^bb1, ^bb2] <{case_operand_segments = array<i32: 0>, case_values = dense<42> : vector<1xi32>, operandSegmentSizes = array<i32: 1, 0, 0>}> : (i64) -> ()
 ^bb1: // pred: ^bb0
   llvm.return
 ^bb2: // pred: ^bb0
diff --git a/mlir/test/Dialect/Linalg/named-ops.mlir b/mlir/test/Dialect/Linalg/named-ops.mlir
index 8f00d54655327..b0bb06cc8654a 100644
--- a/mlir/test/Dialect/Linalg/named-ops.mlir
+++ b/mlir/test/Dialect/Linalg/named-ops.mlir
@@ -178,7 +178,7 @@ func.func @depthwise_conv_2d_input_nhwc_filter_default_attributes(%input: memref
 // -----
 
 func.func @depthwise_conv_2d_input_nhwc_filter_wrong_stride_element_type_properties(%input: memref<1x113x113x96xf32>, %filter: memref<3x3x96xf32>, %output: memref<1x56x56x96xf32>) {
-  // expected-error @+1 {{invalid properties {dilations = dense<1> : vector<2xi64>, operand_segment_sizes = array<i32: 2, 1>, strides = dense<2.000000e+00> : vector<2xf32>} for op linalg.depthwise_conv_2d_nhwc_hwc: Invalid attribute `strides` in property conversion: dense<2.000000e+00> : vector<2xf32>}}
+  // expected-error @+1 {{invalid properties {dilations = dense<1> : vector<2xi64>, operandSegmentSizes = array<i32: 2, 1>, strides = dense<2.000000e+00> : vector<2xf32>} for op linalg.depthwise_conv_2d_nhwc_hwc: Invalid attribute `strides` in property conversion: dense<2.000000e+00> : vector<2xf32>}}
   linalg.depthwise_conv_2d_nhwc_hwc <{dilations = dense<1> : vector<2xi64>, strides = dense<2.0> : vector<2xf32>}>
     ins(%input, %filter: memref<1x113x113x96xf32>, memref<3x3x96xf32>)
     outs(%output: memref<1x56x56x96xf32>)
@@ -1100,7 +1100,7 @@ func.func @conv_interface_wrong_input_indexing_map(
       %1 = "arith.mulf"(%arg3, %arg4) : (f32, f32) -> f32
       %2 = "arith.addf"(%arg5, %1) : (f32, f32) -> f32
       "linalg.yield"(%2) : (f32) -> ()
-    }) {dilations = dense<1> : tensor<2xi64>, linalg.memoized_indexing_maps = [#map0, #map1, #map2], operand_segment_sizes = array<i32: 2, 1>, strides = dense<2> : tensor<2xi64>} : (tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
+    }) {dilations = dense<1> : tensor<2xi64>, linalg.memoized_indexing_maps = [#map0, #map1, #map2], operandSegmentSizes = array<i32: 2, 1>, strides = dense<2> : tensor<2xi64>} : (tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
   return %0 : tensor<?x?x?x?xf32>
 }
 
@@ -1117,7 +1117,7 @@ func.func @conv_interface_wrong_num_operands(
       %1 = "arith.mulf"(%arg3, %arg4) : (f32, f32) -> f32
       %2 = "arith.addf"(%arg5, %1) : (f32, f32) -> f32
       "linalg.yield"(%2) : (f32) -> ()
-    }) {dilations = dense<1> : tensor<2xi64>, linalg.memoized_indexing_maps = [#map0, #map1, #map2], operand_segment_sizes = array<i32: 2, 1>, strides = dense<1> : tensor<2xi64>} : (tensor<?x?x?x?xf32>, tensor<?x?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
+    }) {dilations = dense<1> : tensor<2xi64>, linalg.memoized_indexing_maps = [#map0, #map1, #map2], operandSegmentSizes = array<i32: 2, 1>, strides = dense<1> : tensor<2xi64>} : (tensor<?x?x?x?xf32>, tensor<?x?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
   return %0 : tensor<?x?x?x?xf32>
 }
 
diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir
index fc65fb77ffc88..009f08ced97e0 100644
--- a/mlir/test/Dialect/OpenMP/invalid.mlir
+++ b/mlir/test/Dialect/OpenMP/invalid.mlir
@@ -197,7 +197,7 @@ func.func @omp_simdloop(%lb : index, %ub : index, %step : i32) -> () {
   "omp.simdloop" (%lb, %ub, %step) ({
     ^bb0(%iv: index):
       omp.yield
-  }) {operand_segment_sizes = array<i32: 1,1,1,0,0,0>} :
+  }) {operandSegmentSizes = array<i32: 1,1,1,0,0,0>} :
     (index, index, i32) -> ()
 
   return
@@ -225,7 +225,7 @@ func.func @omp_simdloop_aligned_mismatch(%arg0 : index, %arg1 : index,
     ^bb0(%arg5: index):
       "omp.yield"() : () -> ()
   }) {alignment_values = [128],
-      operand_segment_sizes = array<i32: 1, 1, 1, 2, 0, 0>} : (index, index, index, memref<i32>, memref<i32>) -> ()
+      operandSegmentSizes = array<i32: 1, 1, 1, 2, 0, 0>} : (index, index, index, memref<i32>, memref<i32>) -> ()
   return
 }
 
@@ -238,7 +238,7 @@ func.func @omp_simdloop_aligned_negative(%arg0 : index, %arg1 : index,
   "omp.simdloop"(%arg0, %arg1, %arg2, %arg3, %arg4) ({
     ^bb0(%arg5: index):
       "omp.yield"() : () -> ()
-  }) {alignment_values = [-1, 128], operand_segment_sizes = array<i32: 1, 1, 1,2, 0, 0>} : (index, index, index, memref<i32>, memref<i32>) -> ()
+  }) {alignment_values = [-1, 128], operandSegmentSizes = array<i32: 1, 1, 1,2, 0, 0>} : (index, index, index, memref<i32>, memref<i32>) -> ()
   return
 }
 
@@ -251,7 +251,7 @@ func.func @omp_simdloop_unexpected_alignment(%arg0 : index, %arg1 : index,
   "omp.simdloop"(%arg0, %arg1, %arg2) ({
     ^bb0(%arg5: index):
       "omp.yield"() : () -> ()
-  }) {alignment_values = [1, 128], operand_segment_sizes = array<i32: 1, 1, 1, 0, 0, 0>} : (index, index, index) -> ()
+  }) {alignment_values = [1, 128], operandSegmentSizes = array<i32: 1, 1, 1, 0, 0, 0>} : (index, index, index) -> ()
   return
 }
 
@@ -264,7 +264,7 @@ func.func @omp_simdloop_aligned_float(%arg0 : index, %arg1 : index,
   "omp.simdloop"(%arg0, %arg1, %arg2, %arg3, %arg4) ({
     ^bb0(%arg5: index):
       "omp.yield"() : () -> ()
-  }) {alignment_values = [1.5, 128], operand_segment_sizes = array<i32: 1, 1, 1,2, 0, 0>} : (index, index, index, memref<i32>, memref<i32>) -> ()
+  }) {alignment_values = [1.5, 128], operandSegmentSizes = array<i32: 1, 1, 1,2, 0, 0>} : (index, index, index, memref<i32>, memref<i32>) -> ()
   return
 }
 
@@ -277,7 +277,7 @@ func.func @omp_simdloop_aligned_the_same_var(%arg0 : index, %arg1 : index,
   "omp.simdloop"(%arg0, %arg1, %arg2, %arg3, %arg3) ({
     ^bb0(%arg5: index):
       "omp.yield"() : () -> ()
-  }) {alignment_values = [1, 128], operand_segment_sizes = array<i32: 1, 1, 1,2, 0, 0>} : (index, index, index, memref<i32>, memref<i32>) -> ()
+  }) {alignment_values = [1, 128], operandSegmentSizes = array<i32: 1, 1, 1,2, 0, 0>} : (index, index, index, memref<i32>, memref<i32>) -> ()
   return
 }
 
@@ -291,7 +291,7 @@ func.func @omp_simdloop_nontemporal_the_same_var(%arg0 : index,
   "omp.simdloop"(%arg0, %arg1, %arg2, %arg3, %arg3) ({
     ^bb0(%arg5: index):
       "omp.yield"() : () -> ()
-  }) {operand_segment_sizes = array<i32: 1, 1, 1, 0, 0, 2>} : (index, index, index, memref<i32>, memref<i32>) -> ()
+  }) {operandSegmentSizes = array<i32: 1, 1, 1, 0, 0, 2>} : (index, index, index, memref<i32>, memref<i32>) -> ()
   return
 }
 
@@ -1121,7 +1121,7 @@ func.func @omp_teams_allocate(%data_var : memref<i32>) {
     // expected-error @below {{expected equal sizes for allocate and allocator variables}}
     "omp.teams" (%data_var) ({
       omp.terminator
-    }) {operand_segment_sizes = array<i32: 0,0,0,0,1,0,0>} : (memref<i32>) -> ()
+    }) {operandSegmentSizes = array<i32: 0,0,0,0,1,0,0>} : (memref<i32>) -> ()
     omp.terminator
   }
   return
@@ -1134,7 +1134,7 @@ func.func @omp_teams_num_teams1(%lb : i32) {
     // expected-error @below {{expected num_teams upper bound to be defined if the lower bound is defined}}
     "omp.teams" (%lb) ({
       omp.terminator
-    }) {operand_segment_sizes = array<i32: 1,0,0,0,0,0,0>} : (i32) -> ()
+    }) {operandSegmentSizes = array<i32: 1,0,0,0,0,0,0>} : (i32) -> ()
     omp.terminator
   }
   return
@@ -1159,7 +1159,7 @@ func.func @omp_sections(%data_var : memref<i32>) -> () {
   // expected-error @below {{expected equal sizes for allocate and allocator variables}}
   "omp.sections" (%data_var) ({
     omp.terminator
-  }) {operand_segment_sizes = array<i32: 0,1,0>} : (memref<i32>) -> ()
+  }) {operandSegmentSizes = array<i32: 0,1,0>} : (memref<i32>) -> ()
   return
 }
 
@@ -1169,7 +1169,7 @@ func.func @omp_sections(%data_var : memref<i32>) -> () {
   // expected-error @below {{expected as many reduction symbol references as reduction variables}}
   "omp.sections" (%data_var) ({
     omp.terminator
-  }) {operand_segment_sizes = array<i32: 1,0,0>} : (memref<i32>) -> ()
+  }) {operandSegmentSizes = array<i32: 1,0,0>} : (memref<i32>) -> ()
   return
 }
 
@@ -1284,7 +1284,7 @@ func.func @omp_single(%data_var : memref<i32>) -> () {
   // expected-error @below {{expected equal sizes for allocate and allocator variables}}
   "omp.single" (%data_var) ({
     omp.barrier
-  }) {operand_segment_sizes = array<i32: 1,0>} : (memref<i32>) -> ()
+  }) {operandSegmentSizes = array<i32: 1,0>} : (memref<i32>) -> ()
   return
 }
 
@@ -1294,7 +1294,7 @@ func.func @omp_task_depend(%data_var: memref<i32>) {
   // expected-error @below {{op expected as many depend values as depend variables}}
     "omp.task"(%data_var) ({
       "omp.terminator"() : () -> ()
-    }) {depends = [], operand_segment_sizes = array<i32: 0, 0, 0, 0, 1, 0, 0>} : (memref<i32>) -> ()
+    }) {depends = [], operandSegmentSizes = array<i32: 0, 0, 0, 0, 1, 0, 0>} : (memref<i32>) -> ()
    "func.return"() : () -> ()
 }
 
@@ -1486,7 +1486,7 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) {
   "omp.taskloop"(%lb, %ub, %ub, %lb, %step, %step, %testmemref) ({
   ^bb0(%arg3: i32, %arg4: i32):
     "omp.terminator"() : () -> ()
-  }) {operand_segment_sizes = array<i32: 2, 2, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0>} : (i32, i32, i32, i32, i32, i32, memref<i32>) -> ()
+  }) {operandSegmentSizes = array<i32: 2, 2, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0>} : (i32, i32, i32, i32, i32, i32, memref<i32>) -> ()
   return
 }
 
@@ -1499,7 +1499,7 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) {
   "omp.taskloop"(%lb, %ub, %ub, %lb, %step, %step, %testf32, %testf32_2) ({
   ^bb0(%arg3: i32, %arg4: i32):
     "omp.terminator"() : () -> ()
-  }) {operand_segment_sizes = array<i32: 2, 2, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0>, reductions = [@add_f32]} : (i32, i32, i32, i32, i32, i32, !llvm.ptr<f32>, !llvm.ptr<f32>) -> ()
+  }) {operandSegmentSizes = array<i32: 2, 2, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0>, reductions = [@add_f32]} : (i32, i32, i32, i32, i32, i32, !llvm.ptr<f32>, !llvm.ptr<f32>) -> ()
   return
 }
 
@@ -1512,7 +1512,7 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) {
   "omp.taskloop"(%lb, %ub, %ub, %lb, %step, %step, %testf32) ({
   ^bb0(%arg3: i32, %arg4: i32):
     "omp.terminator"() : () -> ()
-  }) {operand_segment_sizes = array<i32: 2, 2, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0>, reductions = [@add_f32, @add_f32]} : (i32, i32, i32, i32, i32, i32, !llvm.ptr<f32>) -> ()
+  }) {operandSegmentSizes = array<i32: 2, 2, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0>, reductions = [@add_f32, @add_f32]} : (i32, i32, i32, i32, i32, i32, !llvm.ptr<f32>) -> ()
   return
 }
 
@@ -1525,7 +1525,7 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) {
   "omp.taskloop"(%lb, %ub, %ub, %lb, %step, %step, %testf32, %testf32_2) ({
   ^bb0(%arg3: i32, %arg4: i32):
     "omp.terminator"() : () -> ()
-  }) {in_reductions = [@add_f32], operand_segment_sizes = array<i32: 2, 2, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0>} : (i32, i32, i32, i32, i32, i32, !llvm.ptr<f32>, !llvm.ptr<f32>) -> ()
+  }) {in_reductions = [@add_f32], operandSegmentSizes = array<i32: 2, 2, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0>} : (i32, i32, i32, i32, i32, i32, !llvm.ptr<f32>, !llvm.ptr<f32>) -> ()
   return
 }
 
@@ -1538,7 +1538,7 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) {
   "omp.taskloop"(%lb, %ub, %ub, %lb, %step, %step, %testf32_2) ({
   ^bb0(%arg3: i32, %arg4: i32):
     "omp.terminator"() : () -> ()
-  }) {in_reductions = [@add_f32, @add_f32], operand_segment_sizes = array<i32: 2, 2, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0>} : (i32, i32, i32, i32, i32, i32, !llvm.ptr<f32>) -> ()
+  }) {in_reductions = [@add_f32, @add_f32], operandSegmentSizes = array<i32: 2, 2, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0>} : (i32, i32, i32, i32, i32, i32, !llvm.ptr<f32>) -> ()
   return
 }
 
diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir
index 2f0d224a3fef7..be59defd27d03 100644
--- a/mlir/test/Dialect/OpenMP/ops.mlir
+++ b/mlir/test/Dialect/OpenMP/ops.mlir
@@ -59,7 +59,7 @@ func.func @omp_parallel(%data_var : memref<i32>, %if_cond : i1, %num_threads : i
   // CHECK: omp.parallel num_threads(%{{.*}} : i32) allocate(%{{.*}} : memref<i32> -> %{{.*}} : memref<i32>)
     "omp.parallel"(%num_threads, %data_var, %data_var) ({
       omp.terminator
-    }) {operand_segment_sizes = array<i32: 0,1,1,1,0>} : (i32, memref<i32>, memref<i32>) -> ()
+    }) {operandSegmentSizes = array<i32: 0,1,1,1,0>} : (i32, memref<i32>, memref<i32>) -> ()
 
   // CHECK: omp.barrier
     omp.barrier
@@ -68,22 +68,22 @@ func.func @omp_parallel(%data_var : memref<i32>, %if_cond : i1, %num_threads : i
   // CHECK: omp.parallel if(%{{.*}}) allocate(%{{.*}} : memref<i32> -> %{{.*}} : memref<i32>)
     "omp.parallel"(%if_cond, %data_var, %data_var) ({
       omp.terminator
-    }) {operand_segment_sizes = array<i32: 1,0,1,1,0>} : (i1, memref<i32>, memref<i32>) -> ()
+    }) {operandSegmentSizes = array<i32: 1,0,1,1,0>} : (i1, memref<i32>, memref<i32>) -> ()
 
   // test without allocate
   // CHECK: omp.parallel if(%{{.*}}) num_threads(%{{.*}} : i32)
     "omp.parallel"(%if_cond, %num_threads) ({
       omp.terminator
-    }) {operand_segment_sizes = array<i32: 1,1,0,0,0>} : (i1, i32) -> ()
+    }) {operandSegmentSizes = array<i32: 1,1,0,0,0>} : (i1, i32) -> ()
 
     omp.terminator
-  }) {operand_segment_sizes = array<i32: 1,1,1,1,0>, proc_bind_val = #omp<procbindkind spread>} : (i1, i32, memref<i32>, memref<i32>) -> ()
+  }) {operandSegmentSizes = array<i32: 1,1,1,1,0>, proc_bind_val = #omp<procbindkind spread>} : (i1, i32, memref<i32>, memref<i32>) -> ()
 
   // test with multiple parameters for single variadic argument
   // CHECK: omp.parallel allocate(%{{.*}} : memref<i32> -> %{{.*}} : memref<i32>)
   "omp.parallel" (%data_var, %data_var) ({
     omp.terminator
-  }) {operand_segment_sizes = array<i32: 0,0,1,1,0>} : (memref<i32>, memref<i32>) -> ()
+  }) {operandSegmentSizes = array<i32: 0,0,1,1,0>} : (memref<i32>, memref<i32>) -> ()
 
   return
 }
@@ -141,7 +141,7 @@ func.func @omp_wsloop(%lb : index, %ub : index, %step : index, %data_var : memre
   "omp.wsloop" (%lb, %ub, %step) ({
     ^bb0(%iv: index):
       omp.yield
-  }) {operand_segment_sizes = array<i32: 1,1,1,0,0,0,0>, ordered_val = 1} :
+  }) {operandSegmentSizes = array<i32: 1,1,1,0,0,0,0>, ordered_val = 1} :
     (index, index, index) -> ()
 
   // CHECK: omp.wsloop linear(%{{.*}} = %{{.*}} : memref<i32>) schedule(static)
@@ -149,7 +149,7 @@ func.func @omp_wsloop(%lb : index, %ub : index, %step : index, %data_var : memre
   "omp.wsloop" (%lb, %ub, %step, %data_var, %linear_var) ({
     ^bb0(%iv: index):
       omp.yield
-  }) {operand_segment_sizes = array<i32: 1,1,1,1,1,0,0>, schedule_val = #omp<schedulekind static>} :
+  }) {operandSegmentSizes = array<i32: 1,1,1,1,1,0,0>, schedule_val = #omp<schedulekind static>} :
     (index, index, index, memref<i32>, i32) -> ()
 
   // CHECK: omp.wsloop linear(%{{.*}} = %{{.*}} : memref<i32>, %{{.*}} = %{{.*}} : memref<i32>) schedule(static)
@@ -157,7 +157,7 @@ func.func @omp_wsloop(%lb : index, %ub : index, %step : index, %data_var : memre
   "omp.wsloop" (%lb, %ub, %step, %data_var, %data_var, %linear_var, %linear_var) ({
     ^bb0(%iv: index):
       omp.yield
-  }) {operand_segment_sizes = array<i32: 1,1,1,2,2,0,0>, schedule_val = #omp<schedulekind static>} :
+  }) {operandSegmentSizes = array<i32: 1,1,1,2,2,0,0>, schedule_val = #omp<schedulekind static>} :
     (index, index, index, memref<i32>, memref<i32>, i32, i32) -> ()
 
   // CHECK: omp.wsloop linear(%{{.*}} = %{{.*}} : memref<i32>) schedule(dynamic = %{{.*}}) ordered(2)
@@ -165,7 +165,7 @@ func.func @omp_wsloop(%lb : index, %ub : index, %step : index, %data_var : memre
   "omp.wsloop" (%lb, %ub, %step, %data_var, %linear_var, %chunk_var) ({
     ^bb0(%iv: index):
       omp.yield
-  }) {operand_segment_sizes = array<i32: 1,1,1,1,1,0,1>, schedule_val = #omp<schedulekind dynamic>, ordered_val = 2} :
+  }) {operandSegmentSizes = array<i32: 1,1,1,1,1,0,1>, schedule_val = #omp<schedulekind dynamic>, ordered_val = 2} :
     (index, index, index, memref<i32>, i32, i32) -> ()
 
   // CHECK: omp.wsloop schedule(auto) nowait
@@ -173,7 +173,7 @@ func.func @omp_wsloop(%lb : index, %ub : index, %step : index, %data_var : memre
   "omp.wsloop" (%lb, %ub, %step) ({
     ^bb0(%iv: index):
       omp.yield
-  }) {operand_segment_sizes = array<i32: 1,1,1,0,0,0,0>, nowait, schedule_val = #omp<schedulekind auto>} :
+  }) {operandSegmentSizes = array<i32: 1,1,1,0,0,0,0>, nowait, schedule_val = #omp<schedulekind auto>} :
     (index, index, index) -> ()
 
   return
@@ -333,7 +333,7 @@ func.func @omp_simdloop(%lb : index, %ub : index, %step : index) -> () {
   "omp.simdloop" (%lb, %ub, %step) ({
     ^bb0(%iv: index):
       omp.yield
-  }) {operand_segment_sizes = array<i32: 1,1,1,0,0,0>} :
+  }) {operandSegmentSizes = array<i32: 1,1,1,0,0,0>} :
     (index, index, index) -> ()
 
   return
@@ -349,7 +349,7 @@ func.func @omp_simdloop_aligned_list(%arg0 : index, %arg1 : index, %arg2 : index
     ^bb0(%arg5: index):
       "omp.yield"() : () -> ()
   }) {alignment_values = [32, 128],
-      operand_segment_sizes = array<i32: 1, 1, 1, 2, 0, 0>} : (index, index, index, memref<i32>, memref<i32>) -> ()
+      operandSegmentSizes = array<i32: 1, 1, 1, 2, 0, 0>} : (index, index, index, memref<i32>, memref<i32>) -> ()
   return
 }
 
@@ -362,7 +362,7 @@ func.func @omp_simdloop_aligned_single(%arg0 : index, %arg1 : index, %arg2 : ind
     ^bb0(%arg5: index):
       "omp.yield"() : () -> ()
   }) {alignment_values = [32],
-      operand_segment_sizes = array<i32: 1, 1, 1, 1, 0, 0>} : (index, index, index, memref<i32>) -> ()
+      operandSegmentSizes = array<i32: 1, 1, 1, 1, 0, 0>} : (index, index, index, memref<i32>) -> ()
   return
 }
 
@@ -377,7 +377,7 @@ func.func @omp_simdloop_nontemporal_list(%arg0 : index,
   "omp.simdloop"(%arg0, %arg1, %arg2, %arg3, %arg4) ({
     ^bb0(%arg5: index):
       "omp.yield"() : () -> ()
-  }) {operand_segment_sizes = array<i32: 1, 1, 1, 0, 0, 2>} : (index, index, index, memref<i32>, memref<i64>) -> ()
+  }) {operandSegmentSizes = array<i32: 1, 1, 1, 0, 0, 2>} : (index, index, index, memref<i32>, memref<i64>) -> ()
   return
 }
 
@@ -392,7 +392,7 @@ func.func @omp_simdloop_nontemporal_single(%arg0 : index,
   "omp.simdloop"(%arg0, %arg1, %arg2, %arg3) ({
     ^bb0(%arg5: index):
       "omp.yield"() : () -> ()
-  }) {operand_segment_sizes = array<i32: 1, 1, 1, 0, 0, 1>} : (index, index, index, memref<i32>) -> ()
+  }) {operandSegmentSizes = array<i32: 1, 1, 1, 0, 0, 1>} : (index, index, index, memref<i32>) -> ()
   return
 }
 
@@ -487,7 +487,7 @@ func.func @omp_target(%if_cond : i1, %device : si32,  %num_threads : i32, %map1:
     "omp.target"(%if_cond, %device, %num_threads) ({
        // CHECK: omp.terminator
        omp.terminator
-    }) {nowait, operand_segment_sizes = array<i32: 1,1,1,0>} : ( i1, si32, i32 ) -> ()
+    }) {nowait, operandSegmentSizes = array<i32: 1,1,1,0>} : ( i1, si32, i32 ) -> ()
 
     // Test with optional map clause.
     // CHECK: omp.target map((tofrom -> %{{.*}} : memref<?xi32>), (alloc -> %{{.*}} : memref<?xi32>)) {
@@ -1428,13 +1428,13 @@ func.func @omp_sectionsop(%data_var1 : memref<i32>, %data_var2 : memref<i32>,
   "omp.sections" (%data_var1, %data_var1) ({
     // CHECK: omp.terminator
     omp.terminator
-  }) {operand_segment_sizes = array<i32: 0,1,1>} : (memref<i32>, memref<i32>) -> ()
+  }) {operandSegmentSizes = array<i32: 0,1,1>} : (memref<i32>, memref<i32>) -> ()
 
     // CHECK: omp.sections reduction(@add_f32 -> %{{.*}} : !llvm.ptr<f32>)
   "omp.sections" (%redn_var) ({
     // CHECK: omp.terminator
     omp.terminator
-  }) {operand_segment_sizes = array<i32: 1,0,0>, reductions=[@add_f32]} : (!llvm.ptr<f32>) -> ()
+  }) {operandSegmentSizes = array<i32: 1,0,0>, reductions=[@add_f32]} : (!llvm.ptr<f32>) -> ()
 
   // CHECK: omp.sections nowait {
   omp.sections nowait {
diff --git a/mlir/test/Dialect/PDL/invalid.mlir b/mlir/test/Dialect/PDL/invalid.mlir
index c76bc9dcad72d..c6b7fe1cc1789 100644
--- a/mlir/test/Dialect/PDL/invalid.mlir
+++ b/mlir/test/Dialect/PDL/invalid.mlir
@@ -122,7 +122,7 @@ pdl.pattern : benefit(1) {
   // expected-error@below {{expected the same number of attribute values and attribute names, got 1 names and 0 values}}
   %op = "pdl.operation"() {
     attributeValueNames = ["attr"],
-    operand_segment_sizes = array<i32: 0, 0, 0>
+    operandSegmentSizes = array<i32: 0, 0, 0>
   } : () -> (!pdl.operation)
   rewrite %op with "rewriter"
 }
@@ -230,7 +230,7 @@ pdl.pattern : benefit(1) {
 
     // expected-error@below {{expected no replacement values to be provided when the replacement operation is present}}
     "pdl.replace"(%root, %newOp, %newResult) {
-      operand_segment_sizes = array<i32: 1, 1, 1>
+      operandSegmentSizes = array<i32: 1, 1, 1>
     } : (!pdl.operation, !pdl.operation, !pdl.value) -> ()
   }
 }
@@ -276,7 +276,7 @@ pdl.pattern : benefit(1) {
 
   // expected-error@below {{expected rewrite region to be non-empty if external name is not specified}}
   "pdl.rewrite"(%op) ({}) {
-    operand_segment_sizes = array<i32: 1,0>
+    operandSegmentSizes = array<i32: 1,0>
   } : (!pdl.operation) -> ()
 }
 
@@ -289,7 +289,7 @@ pdl.pattern : benefit(1) {
   "pdl.rewrite"(%op, %op) ({
     ^bb1:
   }) {
-    operand_segment_sizes = array<i32: 1, 1>
+    operandSegmentSizes = array<i32: 1, 1>
   }: (!pdl.operation, !pdl.operation) -> ()
 }
 
@@ -303,7 +303,7 @@ pdl.pattern : benefit(1) {
     ^bb1:
   }) {
     name = "foo",
-    operand_segment_sizes = array<i32: 1,0>
+    operandSegmentSizes = array<i32: 1,0>
   } : (!pdl.operation) -> ()
 }
 
diff --git a/mlir/test/Dialect/PDLInterp/invalid.mlir b/mlir/test/Dialect/PDLInterp/invalid.mlir
index 0457a158430a2..c201dda71ef7f 100644
--- a/mlir/test/Dialect/PDLInterp/invalid.mlir
+++ b/mlir/test/Dialect/PDLInterp/invalid.mlir
@@ -19,7 +19,7 @@ pdl_interp.func @rewriter() {
     inferredResultTypes,
     inputAttributeNames = [],
     name = "foo.op",
-    operand_segment_sizes = array<i32: 0, 0, 1>
+    operandSegmentSizes = array<i32: 0, 0, 1>
   } : (!pdl.type) -> (!pdl.operation)
   pdl_interp.finalize
 }
diff --git a/mlir/test/Dialect/SCF/invalid.mlir b/mlir/test/Dialect/SCF/invalid.mlir
index 8566943ef8012..0ff3eaadc8fec 100644
--- a/mlir/test/Dialect/SCF/invalid.mlir
+++ b/mlir/test/Dialect/SCF/invalid.mlir
@@ -139,7 +139,7 @@ func.func @parallel_body_arguments_wrong_type(
   "scf.parallel"(%arg0, %arg1, %arg2) ({
     ^bb0(%i0: f32):
       scf.yield
-  }) {operand_segment_sizes = array<i32: 1, 1, 1, 0>}: (index, index, index) -> ()
+  }) {operandSegmentSizes = array<i32: 1, 1, 1, 0>}: (index, index, index) -> ()
   return
 }
 
@@ -151,7 +151,7 @@ func.func @parallel_body_wrong_number_of_arguments(
   "scf.parallel"(%arg0, %arg1, %arg2) ({
     ^bb0(%i0: index, %i1: index):
       scf.yield
-  }) {operand_segment_sizes = array<i32: 1, 1, 1, 0>}: (index, index, index) -> ()
+  }) {operandSegmentSizes = array<i32: 1, 1, 1, 0>}: (index, index, index) -> ()
   return
 }
 
@@ -689,7 +689,7 @@ func.func @parallel_missing_terminator(%0 : index) {
   ^bb0(%arg1: index):
     // expected-note @below {{terminator here}}
     %2 = "arith.constant"() {value = 1.000000e+00 : f32} : () -> f32
-  }) {operand_segment_sizes = array<i32: 1, 1, 1, 0>} : (index, index, index) -> ()
+  }) {operandSegmentSizes = array<i32: 1, 1, 1, 0>} : (index, index, index) -> ()
   return
 }
 
diff --git a/mlir/test/Dialect/SPIRV/IR/control-flow-ops.mlir b/mlir/test/Dialect/SPIRV/IR/control-flow-ops.mlir
index af3f3ea2889f7..8496448759f0c 100644
--- a/mlir/test/Dialect/SPIRV/IR/control-flow-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/control-flow-ops.mlir
@@ -117,7 +117,7 @@ func.func @wrong_condition_type() -> () {
 func.func @wrong_accessor_count() -> () {
   %true = spirv.Constant true
   // expected-error @+1 {{requires 2 successors but found 1}}
-  "spirv.BranchConditional"(%true)[^one] {operand_segment_sizes = array<i32: 1, 0, 0>} : (i1) -> ()
+  "spirv.BranchConditional"(%true)[^one] {operandSegmentSizes = array<i32: 1, 0, 0>} : (i1) -> ()
 ^one:
   spirv.Return
 ^two:
@@ -130,7 +130,7 @@ func.func @wrong_number_of_weights() -> () {
   %true = spirv.Constant true
   // expected-error @+1 {{must have exactly two branch weights}}
   "spirv.BranchConditional"(%true)[^one, ^two] {branch_weights = [1 : i32, 2 : i32, 3 : i32],
-                                              operand_segment_sizes = array<i32: 1, 0, 0>} : (i1) -> ()
+                                              operandSegmentSizes = array<i32: 1, 0, 0>} : (i1) -> ()
 ^one:
   spirv.Return
 ^two:
diff --git a/mlir/test/Dialect/Transform/ops-invalid.mlir b/mlir/test/Dialect/Transform/ops-invalid.mlir
index c72af7363f67f..3e30947769eb4 100644
--- a/mlir/test/Dialect/Transform/ops-invalid.mlir
+++ b/mlir/test/Dialect/Transform/ops-invalid.mlir
@@ -76,7 +76,7 @@ transform.sequence failures(propagate) {
 "transform.sequence"(%0) ({
 ^bb0(%arg0: !transform.any_op):
   "transform.yield"() : () -> ()
-}) {failure_propagation_mode = 1 : i32, operand_segment_sizes = array<i32: 0, 1>} : (!transform.any_op) -> ()
+}) {failure_propagation_mode = 1 : i32, operandSegmentSizes = array<i32: 0, 1>} : (!transform.any_op) -> ()
 
 // -----
 
diff --git a/mlir/test/IR/parser.mlir b/mlir/test/IR/parser.mlir
index 66c9adca8f98c..0193fae37af7f 100644
--- a/mlir/test/IR/parser.mlir
+++ b/mlir/test/IR/parser.mlir
@@ -460,7 +460,7 @@ func.func @verbose_terminators() -> (i1, i17) {
 
 ^bb1(%x : i1, %y : i17):
 // CHECK:  cf.cond_br %{{.*}}, ^bb2(%{{.*}} : i17), ^bb3(%{{.*}}, %{{.*}} : i1, i17)
-  "cf.cond_br"(%x, %y, %x, %y) [^bb2, ^bb3] {operand_segment_sizes = array<i32: 1, 1, 2>} : (i1, i17, i1, i17) -> ()
+  "cf.cond_br"(%x, %y, %x, %y) [^bb2, ^bb3] {operandSegmentSizes = array<i32: 1, 1, 2>} : (i1, i17, i1, i17) -> ()
 
 ^bb2(%a : i17):
   %true = arith.constant true
diff --git a/mlir/test/IR/traits.mlir b/mlir/test/IR/traits.mlir
index 7d922ecf67de5..0402ebe758750 100644
--- a/mlir/test/IR/traits.mlir
+++ b/mlir/test/IR/traits.mlir
@@ -383,101 +383,101 @@ func.func private @foo()
 // -----
 
 func.func @failedMissingOperandSizeAttr(%arg: i32) {
-  // expected-error @+1 {{op operand count (4) does not match with the total size (0) specified in attribute 'operand_segment_sizes'}}
+  // expected-error @+1 {{op operand count (4) does not match with the total size (0) specified in attribute 'operandSegmentSizes'}}
   "test.attr_sized_operands"(%arg, %arg, %arg, %arg) : (i32, i32, i32, i32) -> ()
 }
 
 // -----
 
 func.func @failedOperandSizeAttrWrongType(%arg: i32) {
-  // expected-error @+1 {{op operand count (4) does not match with the total size (0) specified in attribute 'operand_segment_sizes'}}
-  "test.attr_sized_operands"(%arg, %arg, %arg, %arg) {operand_segment_sizes = 10} : (i32, i32, i32, i32) -> ()
+  // expected-error @+1 {{op operand count (4) does not match with the total size (0) specified in attribute 'operandSegmentSizes'}}
+  "test.attr_sized_operands"(%arg, %arg, %arg, %arg) {operandSegmentSizes = 10} : (i32, i32, i32, i32) -> ()
 }
 
 // -----
 
 func.func @failedOperandSizeAttrWrongElementType(%arg: i32) {
-  // expected-error @+1 {{op operand count (4) does not match with the total size (0) specified in attribute 'operand_segment_sizes'}}
-  "test.attr_sized_operands"(%arg, %arg, %arg, %arg) {operand_segment_sizes = array<i64: 1, 1, 1, 1>} : (i32, i32, i32, i32) -> ()
+  // expected-error @+1 {{op operand count (4) does not match with the total size (0) specified in attribute 'operandSegmentSizes'}}
+  "test.attr_sized_operands"(%arg, %arg, %arg, %arg) {operandSegmentSizes = array<i64: 1, 1, 1, 1>} : (i32, i32, i32, i32) -> ()
 }
 
 // -----
 
 func.func @failedOperandSizeAttrNegativeValue(%arg: i32) {
-  // expected-error @+1 {{'operand_segment_sizes' attribute cannot have negative elements}}
-  "test.attr_sized_operands"(%arg, %arg, %arg, %arg) {operand_segment_sizes = array<i32: 1, 1, -1, 1>} : (i32, i32, i32, i32) -> ()
+  // expected-error @+1 {{'operandSegmentSizes' attribute cannot have negative elements}}
+  "test.attr_sized_operands"(%arg, %arg, %arg, %arg) {operandSegmentSizes = array<i32: 1, 1, -1, 1>} : (i32, i32, i32, i32) -> ()
 }
 
 // -----
 
 func.func @failedOperandSizeAttrWrongTotalSize(%arg: i32) {
-  // expected-error @+1 {{operand count (4) does not match with the total size (3) specified in attribute 'operand_segment_sizes'}}
-  "test.attr_sized_operands"(%arg, %arg, %arg, %arg) {operand_segment_sizes = array<i32: 0, 1, 1, 1>} : (i32, i32, i32, i32) -> ()
+  // expected-error @+1 {{operand count (4) does not match with the total size (3) specified in attribute 'operandSegmentSizes'}}
+  "test.attr_sized_operands"(%arg, %arg, %arg, %arg) {operandSegmentSizes = array<i32: 0, 1, 1, 1>} : (i32, i32, i32, i32) -> ()
 }
 
 // -----
 
 func.func @failedOperandSizeAttrWrongCount(%arg: i32) {
-  // expected-error @+1 {{test.attr_sized_operands' op operand count (4) does not match with the total size (0) specified in attribute 'operand_segment_sizes}}
-  "test.attr_sized_operands"(%arg, %arg, %arg, %arg) {operand_segment_sizes = array<i32: 2, 1, 1>} : (i32, i32, i32, i32) -> ()
+  // expected-error @+1 {{test.attr_sized_operands' op operand count (4) does not match with the total size (0) specified in attribute 'operandSegmentSizes}}
+  "test.attr_sized_operands"(%arg, %arg, %arg, %arg) {operandSegmentSizes = array<i32: 2, 1, 1>} : (i32, i32, i32, i32) -> ()
 }
 
 // -----
 
 func.func @succeededOperandSizeAttr(%arg: i32) {
   // CHECK: test.attr_sized_operands
-  "test.attr_sized_operands"(%arg, %arg, %arg, %arg) {operand_segment_sizes = array<i32: 0, 2, 1, 1>} : (i32, i32, i32, i32) -> ()
+  "test.attr_sized_operands"(%arg, %arg, %arg, %arg) {operandSegmentSizes = array<i32: 0, 2, 1, 1>} : (i32, i32, i32, i32) -> ()
   return
 }
 
 // -----
 
 func.func @failedMissingResultSizeAttr() {
-  // expected-error @+1 {{op result count (4) does not match with the total size (0) specified in attribute 'result_segment_sizes'}}
+  // expected-error @+1 {{op result count (4) does not match with the total size (0) specified in attribute 'resultSegmentSizes'}}
   %0:4 = "test.attr_sized_results"() : () -> (i32, i32, i32, i32)
 }
 
 // -----
 
 func.func @failedResultSizeAttrWrongType() {
-  // expected-error @+1 {{ op result count (4) does not match with the total size (0) specified in attribute 'result_segment_sizes'}}
-  %0:4 = "test.attr_sized_results"() {result_segment_sizes = 10} : () -> (i32, i32, i32, i32)
+  // expected-error @+1 {{ op result count (4) does not match with the total size (0) specified in attribute 'resultSegmentSizes'}}
+  %0:4 = "test.attr_sized_results"() {resultSegmentSizes = 10} : () -> (i32, i32, i32, i32)
 }
 
 
 // -----
 
 func.func @failedResultSizeAttrWrongElementType() {
-  // expected-error @+1 {{ op result count (4) does not match with the total size (0) specified in attribute 'result_segment_sizes'}}
-  %0:4 = "test.attr_sized_results"() {result_segment_sizes = array<i64: 1, 1, 1, 1>} : () -> (i32, i32, i32, i32)
+  // expected-error @+1 {{ op result count (4) does not match with the total size (0) specified in attribute 'resultSegmentSizes'}}
+  %0:4 = "test.attr_sized_results"() {resultSegmentSizes = array<i64: 1, 1, 1, 1>} : () -> (i32, i32, i32, i32)
 }
 
 // -----
 
 func.func @failedResultSizeAttrNegativeValue() {
-  // expected-error @+1 {{'result_segment_sizes' attribute cannot have negative elements}}
-  %0:4 = "test.attr_sized_results"() {result_segment_sizes = array<i32: 1, 1, -1, 1>} : () -> (i32, i32, i32, i32)
+  // expected-error @+1 {{'resultSegmentSizes' attribute cannot have negative elements}}
+  %0:4 = "test.attr_sized_results"() {resultSegmentSizes = array<i32: 1, 1, -1, 1>} : () -> (i32, i32, i32, i32)
 }
 
 // -----
 
 func.func @failedResultSizeAttrWrongTotalSize() {
-  // expected-error @+1 {{result count (4) does not match with the total size (3) specified in attribute 'result_segment_sizes'}}
-  %0:4 = "test.attr_sized_results"() {result_segment_sizes = array<i32: 0, 1, 1, 1>} : () -> (i32, i32, i32, i32)
+  // expected-error @+1 {{result count (4) does not match with the total size (3) specified in attribute 'resultSegmentSizes'}}
+  %0:4 = "test.attr_sized_results"() {resultSegmentSizes = array<i32: 0, 1, 1, 1>} : () -> (i32, i32, i32, i32)
 }
 
 // -----
 
 func.func @failedResultSizeAttrWrongCount() {
-  // expected-error @+1 {{ op result count (4) does not match with the total size (0) specified in attribute 'result_segment_sizes'}}
-  %0:4 = "test.attr_sized_results"() {result_segment_sizes = array<i32: 2, 1, 1>} : () -> (i32, i32, i32, i32)
+  // expected-error @+1 {{ op result count (4) does not match with the total size (0) specified in attribute 'resultSegmentSizes'}}
+  %0:4 = "test.attr_sized_results"() {resultSegmentSizes = array<i32: 2, 1, 1>} : () -> (i32, i32, i32, i32)
 }
 
 // -----
 
 func.func @succeededResultSizeAttr() {
   // CHECK: test.attr_sized_results
-  %0:4 = "test.attr_sized_results"() {result_segment_sizes = array<i32: 0, 2, 1, 1>} : () -> (i32, i32, i32, i32)
+  %0:4 = "test.attr_sized_results"() {resultSegmentSizes = array<i32: 0, 2, 1, 1>} : () -> (i32, i32, i32, i32)
   return
 }
 
diff --git a/mlir/test/Rewrite/pdl-bytecode.mlir b/mlir/test/Rewrite/pdl-bytecode.mlir
index 57bec8ce37073..513ff3c40bc64 100644
--- a/mlir/test/Rewrite/pdl-bytecode.mlir
+++ b/mlir/test/Rewrite/pdl-bytecode.mlir
@@ -1093,7 +1093,7 @@ module @patterns {
 // CHECK-NEXT:  "test.success"(%[[INPUTS]]#4) : (i32) -> ()
 module @ir attributes { test.get_operands_2 } {
   %inputs:5 = "test.producer"() : () -> (i32, i32, i32, i32, i32)
-  "test.attr_sized_operands"(%inputs#0, %inputs#1, %inputs#2, %inputs#3, %inputs#4) {operand_segment_sizes = array<i32: 0, 4, 1, 0>} : (i32, i32, i32, i32, i32) -> ()
+  "test.attr_sized_operands"(%inputs#0, %inputs#1, %inputs#2, %inputs#3, %inputs#4) {operandSegmentSizes = array<i32: 0, 4, 1, 0>} : (i32, i32, i32, i32, i32) -> ()
 }
 
 // -----
@@ -1246,7 +1246,7 @@ module @patterns {
 // CHECK: %[[RESULTS_2_SINGLE:.*]] = "test.success"() : () -> i32
 // CHECK: "test.consumer"(%[[RESULTS_1]]#0, %[[RESULTS_1]]#1, %[[RESULTS_1]]#2, %[[RESULTS_1]]#3, %[[RESULTS_2]]) : (i32, i32, i32, i32, i32) -> ()
 module @ir attributes { test.get_results_2 } {
-  %results:5 = "test.attr_sized_results"() {result_segment_sizes = array<i32: 0, 4, 1, 0>} : () -> (i32, i32, i32, i32, i32)
+  %results:5 = "test.attr_sized_results"() {resultSegmentSizes = array<i32: 0, 4, 1, 0>} : () -> (i32, i32, i32, i32, i32)
   "test.consumer"(%results#0, %results#1, %results#2, %results#3, %results#4) : (i32, i32, i32, i32, i32) -> ()
 }
 
diff --git a/mlir/test/Target/LLVMIR/omptarget-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-llvm.mlir
index 1573f30d5b391..a58d4f1463a0b 100644
--- a/mlir/test/Target/LLVMIR/omptarget-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-llvm.mlir
@@ -2,7 +2,7 @@
 
 llvm.func @_QPopenmp_target_data() {
   %0 = llvm.mlir.constant(1 : i64) : i64
-  %1 = llvm.alloca %0 x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFopenmp_target_dataEi"} : (i64) -> !llvm.ptr<i32>
+  %1 = llvm.alloca %0 x i32 {bindc_name = "i", in_type = i32, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFopenmp_target_dataEi"} : (i64) -> !llvm.ptr<i32>
   omp.target_data   map((tofrom -> %1 : !llvm.ptr<i32>)) {
     %2 = llvm.mlir.constant(99 : i32) : i32
     llvm.store %2, %1 : !llvm.ptr<i32>
@@ -79,9 +79,9 @@ llvm.func @_QPopenmp_target_data_region(%1 : !llvm.ptr<array<1024 x i32>>) {
 
 llvm.func @_QPomp_target_enter_exit(%1 : !llvm.ptr<array<1024 x i32>>, %3 : !llvm.ptr<array<512 x i32>>) {
   %4 = llvm.mlir.constant(1 : i64) : i64
-  %5 = llvm.alloca %4 x i32 {bindc_name = "dvc", in_type = i32, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_enter_exitEdvc"} : (i64) -> !llvm.ptr<i32>
+  %5 = llvm.alloca %4 x i32 {bindc_name = "dvc", in_type = i32, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_enter_exitEdvc"} : (i64) -> !llvm.ptr<i32>
   %6 = llvm.mlir.constant(1 : i64) : i64
-  %7 = llvm.alloca %6 x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_enter_exitEi"} : (i64) -> !llvm.ptr<i32>
+  %7 = llvm.alloca %6 x i32 {bindc_name = "i", in_type = i32, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_enter_exitEi"} : (i64) -> !llvm.ptr<i32>
   %8 = llvm.mlir.constant(5 : i32) : i32
   llvm.store %8, %7 : !llvm.ptr<i32>
   %9 = llvm.mlir.constant(2 : i32) : i32
diff --git a/mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir
index 126fff70ce3b1..bead0200b2731 100644
--- a/mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir
@@ -5,11 +5,11 @@ module attributes {omp.is_target_device = true} {
     %0 = llvm.mlir.constant(20 : i32) : i32
     %1 = llvm.mlir.constant(10 : i32) : i32
     %2 = llvm.mlir.constant(1 : i64) : i64
-    %3 = llvm.alloca %2 x i32 {bindc_name = "a", in_type = i32, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_regionEa"} : (i64) -> !llvm.ptr<i32>
+    %3 = llvm.alloca %2 x i32 {bindc_name = "a", in_type = i32, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_regionEa"} : (i64) -> !llvm.ptr<i32>
     %4 = llvm.mlir.constant(1 : i64) : i64
-    %5 = llvm.alloca %4 x i32 {bindc_name = "b", in_type = i32, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_regionEb"} : (i64) -> !llvm.ptr<i32>
+    %5 = llvm.alloca %4 x i32 {bindc_name = "b", in_type = i32, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_regionEb"} : (i64) -> !llvm.ptr<i32>
     %6 = llvm.mlir.constant(1 : i64) : i64
-    %7 = llvm.alloca %6 x i32 {bindc_name = "c", in_type = i32, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_regionEc"} : (i64) -> !llvm.ptr<i32>
+    %7 = llvm.alloca %6 x i32 {bindc_name = "c", in_type = i32, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_regionEc"} : (i64) -> !llvm.ptr<i32>
     llvm.store %1, %3 : !llvm.ptr<i32>
     llvm.store %0, %5 : !llvm.ptr<i32>
     omp.target   {
diff --git a/mlir/test/Target/LLVMIR/omptarget-region-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-region-llvm.mlir
index e130f96af79f7..9ba083d5137d8 100644
--- a/mlir/test/Target/LLVMIR/omptarget-region-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-region-llvm.mlir
@@ -5,11 +5,11 @@ module attributes {omp.is_target_device = false} {
     %0 = llvm.mlir.constant(20 : i32) : i32
     %1 = llvm.mlir.constant(10 : i32) : i32
     %2 = llvm.mlir.constant(1 : i64) : i64
-    %3 = llvm.alloca %2 x i32 {bindc_name = "a", in_type = i32, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_regionEa"} : (i64) -> !llvm.ptr<i32>
+    %3 = llvm.alloca %2 x i32 {bindc_name = "a", in_type = i32, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_regionEa"} : (i64) -> !llvm.ptr<i32>
     %4 = llvm.mlir.constant(1 : i64) : i64
-    %5 = llvm.alloca %4 x i32 {bindc_name = "b", in_type = i32, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_regionEb"} : (i64) -> !llvm.ptr<i32>
+    %5 = llvm.alloca %4 x i32 {bindc_name = "b", in_type = i32, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_regionEb"} : (i64) -> !llvm.ptr<i32>
     %6 = llvm.mlir.constant(1 : i64) : i64
-    %7 = llvm.alloca %6 x i32 {bindc_name = "c", in_type = i32, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_regionEc"} : (i64) -> !llvm.ptr<i32>
+    %7 = llvm.alloca %6 x i32 {bindc_name = "c", in_type = i32, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_regionEc"} : (i64) -> !llvm.ptr<i32>
     llvm.store %1, %3 : !llvm.ptr<i32>
     llvm.store %0, %5 : !llvm.ptr<i32>
     omp.target   {
diff --git a/mlir/test/Target/LLVMIR/omptarget-region-parallel-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-region-parallel-llvm.mlir
index cfa8039c94ba2..7f5e79db9bcd6 100644
--- a/mlir/test/Target/LLVMIR/omptarget-region-parallel-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-region-parallel-llvm.mlir
@@ -5,11 +5,11 @@ module attributes {omp.is_target_device = false} {
     %0 = llvm.mlir.constant(20 : i32) : i32
     %1 = llvm.mlir.constant(10 : i32) : i32
     %2 = llvm.mlir.constant(1 : i64) : i64
-    %3 = llvm.alloca %2 x i32 {bindc_name = "a", in_type = i32, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_regionEa"} : (i64) -> !llvm.ptr<i32>
+    %3 = llvm.alloca %2 x i32 {bindc_name = "a", in_type = i32, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_regionEa"} : (i64) -> !llvm.ptr<i32>
     %4 = llvm.mlir.constant(1 : i64) : i64
-    %5 = llvm.alloca %4 x i32 {bindc_name = "b", in_type = i32, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_regionEb"} : (i64) -> !llvm.ptr<i32>
+    %5 = llvm.alloca %4 x i32 {bindc_name = "b", in_type = i32, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_regionEb"} : (i64) -> !llvm.ptr<i32>
     %6 = llvm.mlir.constant(1 : i64) : i64
-    %7 = llvm.alloca %6 x i32 {bindc_name = "c", in_type = i32, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_regionEc"} : (i64) -> !llvm.ptr<i32>
+    %7 = llvm.alloca %6 x i32 {bindc_name = "c", in_type = i32, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_regionEc"} : (i64) -> !llvm.ptr<i32>
     llvm.store %1, %3 : !llvm.ptr<i32>
     llvm.store %0, %5 : !llvm.ptr<i32>
     omp.target {
diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
index 6469868b8751f..4fb00660cc423 100644
--- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
@@ -310,7 +310,7 @@ llvm.func @wsloop_simple(%arg0: !llvm.ptr<f32>) {
       llvm.store %3, %4 : !llvm.ptr<f32>
       omp.yield
       // CHECK: call void @__kmpc_for_static_fini(ptr @[[$loc_struct]],
-    }) {operand_segment_sizes = array<i32: 1, 1, 1, 0, 0, 0, 0>} : (i64, i64, i64) -> ()
+    }) {operandSegmentSizes = array<i32: 1, 1, 1, 0, 0, 0, 0>} : (i64, i64, i64) -> ()
     omp.terminator
   }
   llvm.return
@@ -330,7 +330,7 @@ llvm.func @wsloop_inclusive_1(%arg0: !llvm.ptr<f32>) {
     %4 = llvm.getelementptr %arg0[%arg1] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
     llvm.store %3, %4 : !llvm.ptr<f32>
     omp.yield
-  }) {operand_segment_sizes = array<i32: 1, 1, 1, 0, 0, 0, 0>} : (i64, i64, i64) -> ()
+  }) {operandSegmentSizes = array<i32: 1, 1, 1, 0, 0, 0, 0>} : (i64, i64, i64) -> ()
   llvm.return
 }
 
@@ -348,7 +348,7 @@ llvm.func @wsloop_inclusive_2(%arg0: !llvm.ptr<f32>) {
     %4 = llvm.getelementptr %arg0[%arg1] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
     llvm.store %3, %4 : !llvm.ptr<f32>
     omp.yield
-  }) {inclusive, operand_segment_sizes = array<i32: 1, 1, 1, 0, 0, 0, 0>} : (i64, i64, i64) -> ()
+  }) {inclusive, operandSegmentSizes = array<i32: 1, 1, 1, 0, 0, 0, 0>} : (i64, i64, i64) -> ()
   llvm.return
 }
 
@@ -628,7 +628,7 @@ llvm.func @simdloop_simple(%lb : i64, %ub : i64, %step : i64, %arg0: !llvm.ptr<f
       %4 = llvm.getelementptr %arg0[%iv] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
       llvm.store %3, %4 : !llvm.ptr<f32>
       omp.yield
-  }) {operand_segment_sizes = array<i32: 1,1,1,0,0,0>} :
+  }) {operandSegmentSizes = array<i32: 1,1,1,0,0,0>} :
     (i64, i64, i64) -> ()
 
   llvm.return
@@ -733,9 +733,9 @@ llvm.func @simdloop_simple_multiple_simdlen_safelen(%lb1 : i64, %ub1 : i64, %ste
 // CHECK-LABEL: @simdloop_if
 llvm.func @simdloop_if(%arg0: !llvm.ptr<i32> {fir.bindc_name = "n"}, %arg1: !llvm.ptr<i32> {fir.bindc_name = "threshold"}) {
   %0 = llvm.mlir.constant(1 : i64) : i64
-  %1 = llvm.alloca %0 x i32 {adapt.valuebyref, in_type = i32, operand_segment_sizes = array<i32: 0, 0>} : (i64) -> !llvm.ptr<i32>
+  %1 = llvm.alloca %0 x i32 {adapt.valuebyref, in_type = i32, operandSegmentSizes = array<i32: 0, 0>} : (i64) -> !llvm.ptr<i32>
   %2 = llvm.mlir.constant(1 : i64) : i64
-  %3 = llvm.alloca %2 x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFtest_simdEi"} : (i64) -> !llvm.ptr<i32>
+  %3 = llvm.alloca %2 x i32 {bindc_name = "i", in_type = i32, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFtest_simdEi"} : (i64) -> !llvm.ptr<i32>
   %4 = llvm.mlir.constant(0 : i32) : i32
   %5 = llvm.load %arg0 : !llvm.ptr<i32>
   %6 = llvm.mlir.constant(1 : i32) : i32
diff --git a/mlir/test/Transforms/canonicalize-block-merge.mlir b/mlir/test/Transforms/canonicalize-block-merge.mlir
index d33c911e042d7..bf44973ab646c 100644
--- a/mlir/test/Transforms/canonicalize-block-merge.mlir
+++ b/mlir/test/Transforms/canonicalize-block-merge.mlir
@@ -257,7 +257,7 @@ func.func @nomerge(%arg0: i32, %i: i32) {
 func.func @mismatch_dominance() -> i32 {
   // CHECK: %[[RES:.*]] = "test.producing_br"()
   %0 = "test.producing_br"()[^bb1, ^bb2] {
-        operand_segment_sizes = array<i32: 0, 0>
+        operandSegmentSizes = array<i32: 0, 0>
 	} : () -> i32
 
 ^bb1:
diff --git a/mlir/test/Transforms/sccp.mlir b/mlir/test/Transforms/sccp.mlir
index db24432b65cc6..dcae052c29c24 100644
--- a/mlir/test/Transforms/sccp.mlir
+++ b/mlir/test/Transforms/sccp.mlir
@@ -204,7 +204,7 @@ func.func @simple_produced_operand() -> (i32, i32) {
   // CHECK: %[[ONE:.*]] = arith.constant 1
   %1 = arith.constant 1 : i32
   "test.internal_br"(%1) [^bb1, ^bb2] {
-    operand_segment_sizes = array<i32: 0, 1>
+    operandSegmentSizes = array<i32: 0, 1>
   } : (i32) -> ()
 
 ^bb1:
diff --git a/mlir/test/mlir-tblgen/op-decl-and-defs.td b/mlir/test/mlir-tblgen/op-decl-and-defs.td
index aad7ea4437e78..077aa750352e0 100644
--- a/mlir/test/mlir-tblgen/op-decl-and-defs.td
+++ b/mlir/test/mlir-tblgen/op-decl-and-defs.td
@@ -141,7 +141,7 @@ def NS_AttrSizedOperandOp : NS_Op<"attr_sized_operands",
     Variadic<I32>:$b,
     I32:$c,
     Variadic<I32>:$d,
-    I32ElementsAttr:$operand_segment_sizes
+    I32ElementsAttr:$operandSegmentSizes
   );
 }
 
diff --git a/mlir/test/mlir-tblgen/op-python-bindings.td b/mlir/test/mlir-tblgen/op-python-bindings.td
index de979f7e8f43e..a131209fa45cb 100644
--- a/mlir/test/mlir-tblgen/op-python-bindings.td
+++ b/mlir/test/mlir-tblgen/op-python-bindings.td
@@ -39,7 +39,7 @@ def AttrSizedOperandsOp : TestOp<"attr_sized_operands",
   // CHECK: def variadic1(self):
   // CHECK:   operand_range = _ods_segmented_accessor(
   // CHECK:       self.operation.operands,
-  // CHECK:       self.operation.attributes["operand_segment_sizes"], 0)
+  // CHECK:       self.operation.attributes["operandSegmentSizes"], 0)
   // CHECK:   return operand_range
   // CHECK-NOT: if len(operand_range)
   //
@@ -47,14 +47,14 @@ def AttrSizedOperandsOp : TestOp<"attr_sized_operands",
   // CHECK: def non_variadic(self):
   // CHECK:   operand_range = _ods_segmented_accessor(
   // CHECK:       self.operation.operands,
-  // CHECK:       self.operation.attributes["operand_segment_sizes"], 1)
+  // CHECK:       self.operation.attributes["operandSegmentSizes"], 1)
   // CHECK:   return operand_range[0]
   //
   // CHECK: @builtins.property
   // CHECK: def variadic2(self):
   // CHECK:   operand_range = _ods_segmented_accessor(
   // CHECK:       self.operation.operands,
-  // CHECK:       self.operation.attributes["operand_segment_sizes"], 2)
+  // CHECK:       self.operation.attributes["operandSegmentSizes"], 2)
   // CHECK:   return operand_range[0] if len(operand_range) > 0 else None
   let arguments = (ins Variadic<AnyType>:$variadic1, AnyType:$non_variadic,
                    Optional<AnyType>:$variadic2);
@@ -83,21 +83,21 @@ def AttrSizedResultsOp : TestOp<"attr_sized_results",
   // CHECK: def variadic1(self):
   // CHECK:   result_range = _ods_segmented_accessor(
   // CHECK:       self.operation.results,
-  // CHECK:       self.operation.attributes["result_segment_sizes"], 0)
+  // CHECK:       self.operation.attributes["resultSegmentSizes"], 0)
   // CHECK:   return result_range[0] if len(result_range) > 0 else None
   //
   // CHECK: @builtins.property
   // CHECK: def non_variadic(self):
   // CHECK:   result_range = _ods_segmented_accessor(
   // CHECK:       self.operation.results,
-  // CHECK:       self.operation.attributes["result_segment_sizes"], 1)
+  // CHECK:       self.operation.attributes["resultSegmentSizes"], 1)
   // CHECK:   return result_range[0]
   //
   // CHECK: @builtins.property
   // CHECK: def variadic2(self):
   // CHECK:   result_range = _ods_segmented_accessor(
   // CHECK:       self.operation.results,
-  // CHECK:       self.operation.attributes["result_segment_sizes"], 2)
+  // CHECK:       self.operation.attributes["resultSegmentSizes"], 2)
   // CHECK:   return result_range
   // CHECK-NOT: if len(result_range)
   let results = (outs Optional<AnyType>:$variadic1, AnyType:$non_variadic,
diff --git a/mlir/test/python/dialects/linalg/ops.py b/mlir/test/python/dialects/linalg/ops.py
index 88f48d0d544e7..b728e00837814 100644
--- a/mlir/test/python/dialects/linalg/ops.py
+++ b/mlir/test/python/dialects/linalg/ops.py
@@ -100,7 +100,7 @@ def named_form(lhs, rhs):
                 init_result = tensor.EmptyOp([4, 8], f32)
                 #      CHECK: "linalg.matmul"(%{{.*}})
                 # CHECK-SAME:    cast = #linalg.type_fn<cast_signed>
-                # CHECK-SAME:    odsOperandSegmentSizes = array<i32: 2, 1>
+                # CHECK-SAME:    operandSegmentSizes = array<i32: 2, 1>
                 # CHECK-NEXT:  ^bb0(%{{.*}}: f32, %{{.*}}: f32, %{{.*}}: f32):
                 # CHECK-NEXT:    arith.mulf{{.*}} (f32, f32) -> f32
                 # CHECK-NEXT:    arith.addf{{.*}} (f32, f32) -> f32
diff --git a/mlir/test/python/dialects/ods_helpers.py b/mlir/test/python/dialects/ods_helpers.py
index 71879bdcb51f5..0d2a18e0eb0af 100644
--- a/mlir/test/python/dialects/ods_helpers.py
+++ b/mlir/test/python/dialects/ods_helpers.py
@@ -96,8 +96,8 @@ class TestOp(OpView):
             # CHECK: %[[V0:.+]] = "custom.value"
             # CHECK: %[[V1:.+]] = "custom.value"
             # CHECK: "custom.test_op"(%[[V0]], %[[V1]])
-            # CHECK-NOT: operand_segment_sizes
-            # CHECK-NOT: result_segment_sizes
+            # CHECK-NOT: operandSegmentSizes
+            # CHECK-NOT: resultSegmentSizes
             # CHECK-SAME: : (i32, i32) -> (i8, i16)
             print(m)
 
@@ -128,8 +128,8 @@ class TestOp(OpView):
             # CHECK: %[[V2:.+]] = "custom.value"
             # CHECK: %[[V3:.+]] = "custom.value"
             # CHECK: "custom.test_op"(%[[V0]], %[[V1]], %[[V2]], %[[V3]])
-            # CHECK-SAME: operand_segment_sizes = array<i32: 1, 2, 1>
-            # CHECK-SAME: result_segment_sizes = array<i32: 2, 1, 1>
+            # CHECK-SAME: operandSegmentSizes = array<i32: 1, 2, 1>
+            # CHECK-SAME: resultSegmentSizes = array<i32: 2, 1, 1>
             # CHECK-SAME: : (i32, i32, i32, i32) -> (i8, i16, i32, i64)
             op = TestOp.build_generic(
                 results=[[t0, t1], t2, t3], operands=[v0, [v1, v2], v3]
@@ -137,8 +137,8 @@ class TestOp(OpView):
 
             # Now test with optional omitted.
             # CHECK: "custom.test_op"(%[[V0]])
-            # CHECK-SAME: operand_segment_sizes = array<i32: 1, 0, 0>
-            # CHECK-SAME: result_segment_sizes = array<i32: 0, 0, 1>
+            # CHECK-SAME: operandSegmentSizes = array<i32: 1, 0, 0>
+            # CHECK-SAME: resultSegmentSizes = array<i32: 0, 0, 1>
             # CHECK-SAME: (i32) -> i64
             op = TestOp.build_generic(
                 results=[None, None, t3], operands=[v0, None, None]
diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
index 0ff72ec75f1d4..e1161e6dca3da 100644
--- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
@@ -56,8 +56,8 @@ static const char *const propertyDiag = "propDiag";
 
 /// The names of the implicit attributes that contain variadic operand and
 /// result segment sizes.
-static const char *const operandSegmentAttrName = "operand_segment_sizes";
-static const char *const resultSegmentAttrName = "result_segment_sizes";
+static const char *const operandSegmentAttrName = "operandSegmentSizes";
+static const char *const resultSegmentAttrName = "resultSegmentSizes";
 
 /// Code for an Op to lookup an attribute. Uses cached identifiers and subrange
 /// lookup.
@@ -447,7 +447,7 @@ void OpOrAdaptorHelper::computeAttrMetadata() {
     if (op.getDialect().usePropertiesForAttributes()) {
       operandSegmentsSizeStorage =
           llvm::formatv("std::array<int32_t, {0}>", op.getNumOperands());
-      operandSegmentsSize = {"odsOperandSegmentSizes",
+      operandSegmentsSize = {"operandSegmentSizes",
                              makeProperty(operandSegmentsSizeStorage)};
     } else {
       attrMetadata.insert(
@@ -460,7 +460,7 @@ void OpOrAdaptorHelper::computeAttrMetadata() {
     if (op.getDialect().usePropertiesForAttributes()) {
       resultSegmentsSizeStorage =
           llvm::formatv("std::array<int32_t, {0}>", op.getNumResults());
-      resultSegmentsSize = {"odsResultSegmentSizes",
+      resultSegmentsSize = {"resultSegmentSizes",
                             makeProperty(resultSegmentsSizeStorage)};
     } else {
       attrMetadata.insert(
@@ -1306,10 +1306,12 @@ void OpEmitter::genPropertiesSupport() {
       std::string getAttr;
       llvm::raw_string_ostream os(getAttr);
       os << "   auto attr = dict.get(\"" << name << "\");";
-      if (name == "odsOperandSegmentSizes") {
+      if (name == operandSegmentAttrName) {
+        // Backward compat for now, TODO: Remove at some point.
         os << "   if (!attr) attr = dict.get(\"operand_segment_sizes\");";
       }
-      if (name == "odsResultSegmentSizes") {
+      if (name == resultSegmentAttrName) {
+        // Backward compat for now, TODO: Remove at some point.
         os << "   if (!attr) attr = dict.get(\"result_segment_sizes\");";
       }
       os.flush();
@@ -1327,10 +1329,12 @@ void OpEmitter::genPropertiesSupport() {
       std::string getAttr;
       llvm::raw_string_ostream os(getAttr);
       os << "   auto attr = dict.get(\"" << name << "\");";
-      if (name == "odsOperandSegmentSizes") {
+      if (name == operandSegmentAttrName) {
+        // Backward compat for now
         os << "   if (!attr) attr = dict.get(\"operand_segment_sizes\");";
       }
-      if (name == "odsResultSegmentSizes") {
+      if (name == resultSegmentAttrName) {
+        // Backward compat for now
         os << "   if (!attr) attr = dict.get(\"result_segment_sizes\");";
       }
       os.flush();
@@ -1466,34 +1470,34 @@ void OpEmitter::genPropertiesSupport() {
     // even though it is a native property.
     const auto *namedProperty = cast<const NamedProperty *>(attrOrProp);
     StringRef name = namedProperty->name;
-    if (name != "odsOperandSegmentSizes" && name != "odsResultSegmentSizes")
+    if (name != operandSegmentAttrName && name != resultSegmentAttrName)
       continue;
     auto &prop = namedProperty->prop;
     FmtContext fctx;
     fctx.addSubst("_ctxt", "ctx");
     fctx.addSubst("_storage", Twine("prop.") + name);
-    if (name == "odsOperandSegmentSizes") {
+    if (name == operandSegmentAttrName) {
       getInherentAttrMethod
-          << formatv("    if (name == \"odsOperandSegmentSizes\" || name == "
+          << formatv("    if (name == \"operand_segment_sizes\" || name == "
                      "\"{0}\") return ",
                      operandSegmentAttrName);
     } else {
       getInherentAttrMethod
-          << formatv("    if (name == \"odsResultSegmentSizes\" || name == "
+          << formatv("    if (name == \"result_segment_sizes\" || name == "
                      "\"{0}\") return ",
                      resultSegmentAttrName);
     }
     getInherentAttrMethod << tgfmt(prop.getConvertToAttributeCall(), &fctx)
                           << ";\n";
 
-    if (name == "odsOperandSegmentSizes") {
-      setInherentAttrMethod << formatv(
-          "        if (name == \"odsOperandSegmentSizes\" || name == "
-          "\"{0}\") {{",
-          operandSegmentAttrName);
+    if (name == operandSegmentAttrName) {
+      setInherentAttrMethod
+          << formatv("        if (name == \"operand_segment_sizes\" || name == "
+                     "\"{0}\") {{",
+                     operandSegmentAttrName);
     } else {
       setInherentAttrMethod
-          << formatv("        if (name == \"odsResultSegmentSizes\" || name == "
+          << formatv("        if (name == \"result_segment_sizes\" || name == "
                      "\"{0}\") {{",
                      resultSegmentAttrName);
     }
@@ -1507,7 +1511,7 @@ void OpEmitter::genPropertiesSupport() {
     }
 )decl",
                                      name);
-    if (name == "odsOperandSegmentSizes") {
+    if (name == operandSegmentAttrName) {
       populateInherentAttrsMethod
           << formatv("  attrs.append(\"{0}\", {1});\n", operandSegmentAttrName,
                      tgfmt(prop.getConvertToAttributeCall(), &fctx));
@@ -2015,7 +2019,7 @@ void OpEmitter::genNamedOperandGetters() {
   if (op.getTrait("::mlir::OpTrait::AttrSizedOperandSegments")) {
     if (op.getDialect().usePropertiesForAttributes())
       attrSizeInitCode = formatv(adapterSegmentSizeAttrInitCodeProperties,
-                                 "getProperties().odsOperandSegmentSizes");
+                                 "getProperties().operandSegmentSizes");
 
     else
       attrSizeInitCode = formatv(opSegmentSizeAttrInitCode,
@@ -2057,7 +2061,7 @@ void OpEmitter::genNamedOperandSetters() {
         body << formatv(", ::mlir::MutableOperandRange::OperandSegment({0}u, "
                         "{{getOperandSegmentSizesAttrName(), "
                         "DenseI32ArrayAttr::get(getContext(), "
-                        "getProperties().odsOperandSegmentSizes)})",
+                        "getProperties().operandSegmentSizes)})",
                         i);
       else
         body << formatv(
@@ -2116,7 +2120,7 @@ void OpEmitter::genNamedResultGetters() {
   if (attrSizedResults) {
     if (op.getDialect().usePropertiesForAttributes())
       attrSizeInitCode = formatv(adapterSegmentSizeAttrInitCodeProperties,
-                                 "getProperties().odsResultSegmentSizes");
+                                 "getProperties().resultSegmentSizes");
 
     else
       attrSizeInitCode = formatv(opSegmentSizeAttrInitCode,
@@ -2291,7 +2295,7 @@ void OpEmitter::genSeparateArgParamBuilder() {
              << ");\n";
       }
 
-      // Automatically create the 'result_segment_sizes' attribute using
+      // Automatically create the 'resultSegmentSizes' attribute using
       // the length of the type ranges.
       if (op.getTrait("::mlir::OpTrait::AttrSizedResultSegments")) {
         if (op.getDialect().usePropertiesForAttributes()) {
@@ -2321,7 +2325,7 @@ void OpEmitter::genSeparateArgParamBuilder() {
         if (op.getDialect().usePropertiesForAttributes()) {
           body << "}), " << builderOpState
                << ".getOrAddProperties<Properties>()."
-                  "odsResultSegmentSizes.begin());\n";
+                  "resultSegmentSizes.begin());\n";
         } else {
           body << "}));\n";
         }
@@ -2947,7 +2951,7 @@ void OpEmitter::genCodeForAddingArgAndRegionForBuilder(
       emitSegment();
       body << "}), " << builderOpState
            << ".getOrAddProperties<Properties>()."
-              "odsOperandSegmentSizes.begin());\n";
+              "operandSegmentSizes.begin());\n";
     } else {
       body << "  " << builderOpState << ".addAttribute(" << sizes << "AttrName("
            << builderOpState << ".name), "
@@ -3819,8 +3823,7 @@ OpOperandAdaptorEmitter::OpOperandAdaptorEmitter(
       if (attr) {
         storageType = attr->getStorageType();
       } else {
-        if (name != "odsOperandSegmentSizes" &&
-            name != "odsResultSegmentSizes") {
+        if (name != operandSegmentAttrName && name != resultSegmentAttrName) {
           report_fatal_error("unexpected AttributeMetadata");
         }
         // TODO: update to use native integers.
@@ -3935,7 +3938,7 @@ OpOperandAdaptorEmitter::OpOperandAdaptorEmitter(
     if (op.getDialect().usePropertiesForAttributes())
       sizeAttrInit =
           formatv(adapterSegmentSizeAttrInitCodeProperties,
-                  llvm::formatv("getProperties().odsOperandSegmentSizes"));
+                  llvm::formatv("getProperties().operandSegmentSizes"));
     else
       sizeAttrInit = formatv(adapterSegmentSizeAttrInitCode,
                              emitHelper.getAttr(operandSegmentAttrName));
diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp
index c38f873ddaba4..5f33f7faf80c0 100644
--- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp
@@ -1666,10 +1666,10 @@ void OperationFormat::genParserVariadicSegmentResolution(Operator &op,
         llvm::interleaveComma(op.getOperands(), body, interleaveFn);
         body << formatv("}), "
                         "result.getOrAddProperties<{0}::Properties>()."
-                        "odsOperandSegmentSizes.begin());\n",
+                        "operandSegmentSizes.begin());\n",
                         op.getCppClassName());
       } else {
-        body << "  result.addAttribute(\"operand_segment_sizes\", "
+        body << "  result.addAttribute(\"operandSegmentSizes\", "
              << "parser.getBuilder().getDenseI32ArrayAttr({";
         llvm::interleaveComma(op.getOperands(), body, interleaveFn);
         body << "}));\n";
@@ -1710,10 +1710,10 @@ void OperationFormat::genParserVariadicSegmentResolution(Operator &op,
       llvm::interleaveComma(op.getResults(), body, interleaveFn);
       body << formatv("}), "
                       "result.getOrAddProperties<{0}::Properties>()."
-                      "odsResultSegmentSizes.begin());\n",
+                      "resultSegmentSizes.begin());\n",
                       op.getCppClassName());
     } else {
-      body << "  result.addAttribute(\"result_segment_sizes\", "
+      body << "  result.addAttribute(\"resultSegmentSizes\", "
            << "parser.getBuilder().getDenseI32ArrayAttr({";
       llvm::interleaveComma(op.getResults(), body, interleaveFn);
       body << "}));\n";
@@ -1767,10 +1767,10 @@ static void genAttrDictPrinter(OperationFormat &fmt, Operator &op,
   // Elide the variadic segment size attributes if necessary.
   if (!fmt.allOperands &&
       op.getTrait("::mlir::OpTrait::AttrSizedOperandSegments"))
-    body << "  elidedAttrs.push_back(\"operand_segment_sizes\");\n";
+    body << "  elidedAttrs.push_back(\"operandSegmentSizes\");\n";
   if (!fmt.allResultTypes &&
       op.getTrait("::mlir::OpTrait::AttrSizedResultSegments"))
-    body << "  elidedAttrs.push_back(\"result_segment_sizes\");\n";
+    body << "  elidedAttrs.push_back(\"resultSegmentSizes\");\n";
   for (const StringRef key : fmt.inferredAttributes.keys())
     body << "  elidedAttrs.push_back(\"" << key << "\");\n";
   for (const NamedAttribute *attr : fmt.usedAttributes)
diff --git a/mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp b/mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp
index dd6e52d300efe..7c7b991fb7b07 100644
--- a/mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp
@@ -170,7 +170,7 @@ constexpr const char *opVariadicSegmentTemplate = R"Py(
   def {0}(self):
     {1}_range = _ods_segmented_accessor(
          self.operation.{1}s,
-         self.operation.attributes["{1}_segment_sizes"], {2})
+         self.operation.attributes["{1}SegmentSizes"], {2})
     return {1}_range{3}
 )Py";
 
diff --git a/mlir/unittests/IR/AdaptorTest.cpp b/mlir/unittests/IR/AdaptorTest.cpp
index ec15d30875755..4a01d2c52b645 100644
--- a/mlir/unittests/IR/AdaptorTest.cpp
+++ b/mlir/unittests/IR/AdaptorTest.cpp
@@ -39,7 +39,7 @@ TEST(Adaptor, GenericAdaptorsOperandAccess) {
     // value from the value 0.
     SmallVector<std::optional<int>> v = {0, 4};
     OIListSimple::Properties prop;
-    prop.odsOperandSegmentSizes = {1, 0, 1};
+    prop.operandSegmentSizes = {1, 0, 1};
     OIListSimple::GenericAdaptor<ArrayRef<std::optional<int>>> d(v, {}, prop,
                                                                  {});
     EXPECT_EQ(d.getArg0(), 0);

From 67dca9da75b2661d14a34067551ea71aaae19cf8 Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph@gmail.com>
Date: Sun, 20 Aug 2023 13:17:42 -0700
Subject: [PATCH 13/92] Fix MLIR build failure: error: no member named
 'getValue' in 'mlir::OptionalParseResult'

Fix #63072
---
 mlir/test/mlir-tblgen/attr-or-type-format.td | 2 +-
 mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/test/mlir-tblgen/attr-or-type-format.td b/mlir/test/mlir-tblgen/attr-or-type-format.td
index 230fa90713f1a..2782f55bc966e 100644
--- a/mlir/test/mlir-tblgen/attr-or-type-format.td
+++ b/mlir/test/mlir-tblgen/attr-or-type-format.td
@@ -648,5 +648,5 @@ def TypeN : TestType<"TestP"> {
 // DEFAULT_TYPE_PARSER: TestDialect::parseType(::mlir::DialectAsmParser &parser)
 // DEFAULT_TYPE_PARSER: auto parseResult = parseOptionalDynamicType(mnemonic, parser, genType);
 // DEFAULT_TYPE_PARSER: if (parseResult.has_value()) {
-// DEFAULT_TYPE_PARSER:   if (::mlir::succeeded(parseResult.getValue()))
+// DEFAULT_TYPE_PARSER:   if (::mlir::succeeded(parseResult.value()))
 // DEFAULT_TYPE_PARSER:     return genType;
\ No newline at end of file
diff --git a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp
index 943e323c6af40..f6e43d42d29f0 100644
--- a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp
+++ b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp
@@ -763,7 +763,7 @@ static const char *const dialectDynamicTypeParserDispatch = R"(
   {
     auto parseResult = parseOptionalDynamicType(mnemonic, parser, genType);
     if (parseResult.has_value()) {
-      if (::mlir::succeeded(parseResult.getValue()))
+      if (::mlir::succeeded(parseResult.value()))
         return genType;
       return ::mlir::Type();
     }

From 0d8fd074b7a5426541edd91774e5cee63dbe805d Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph@gmail.com>
Date: Tue, 22 Aug 2023 09:54:50 -0700
Subject: [PATCH 14/92] Fix some missing fully qualified namespaces in MLIR
 TableGen generator

Using properties would break when a dialect isn't in the mlir namespace
---
 mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp | 30 ++++++++++-----------
 mlir/tools/mlir-tblgen/OpFormatGen.cpp      |  2 +-
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
index e1161e6dca3da..4e07bc3d48272 100644
--- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
@@ -158,24 +158,24 @@ static const char *const valueRangeReturnCode = R"(
 /// Read operand/result segment_size from bytecode.
 static const char *const readBytecodeSegmentSize = R"(
 if ($_reader.getBytecodeVersion() < /*kNativePropertiesODSSegmentSize=*/6) {
-  DenseI32ArrayAttr attr;
-  if (::mlir::failed($_reader.readAttribute(attr))) return failure();
+  ::mlir::DenseI32ArrayAttr attr;
+  if (::mlir::failed($_reader.readAttribute(attr))) return ::mlir::failure();
   if (attr.size() > static_cast<int64_t>(sizeof($_storage) / sizeof(int32_t))) {
     $_reader.emitError("size mismatch for operand/result_segment_size");
-    return failure();
+    return ::mlir::failure();
   }
-  llvm::copy(ArrayRef<int32_t>(attr), $_storage.begin());
+  llvm::copy(::llvm::ArrayRef<int32_t>(attr), $_storage.begin());
 } else {
-  return $_reader.readSparseArray(MutableArrayRef($_storage));
+  return $_reader.readSparseArray(::llvm::MutableArrayRef($_storage));
 }
 )";
 
 /// Write operand/result segment_size to bytecode.
 static const char *const writeBytecodeSegmentSize = R"(
 if ($_writer.getBytecodeVersion() < /*kNativePropertiesODSSegmentSize=*/6)
-  $_writer.writeAttribute(DenseI32ArrayAttr::get(getContext(), $_storage));
+  $_writer.writeAttribute(::mlir::DenseI32ArrayAttr::get(getContext(), $_storage));
 else
-  $_writer.writeSparseArray(ArrayRef($_storage));
+  $_writer.writeSparseArray(::llvm::ArrayRef($_storage));
 )";
 
 /// A header for indicating code sections.
@@ -430,15 +430,15 @@ void OpOrAdaptorHelper::computeAttrMetadata() {
         /*interfaceType=*/"::llvm::ArrayRef<int32_t>",
         /*convertFromStorageCall=*/"$_storage",
         /*assignToStorageCall=*/
-        "llvm::copy($_value, $_storage.begin())",
+        "::llvm::copy($_value, $_storage.begin())",
         /*convertToAttributeCall=*/
-        "DenseI32ArrayAttr::get($_ctxt, $_storage)",
+        "::mlir::DenseI32ArrayAttr::get($_ctxt, $_storage)",
         /*convertFromAttributeCall=*/
         "return convertFromAttribute($_storage, $_attr, $_diag);",
         /*readFromMlirBytecodeCall=*/readBytecodeSegmentSize,
         /*writeToMlirBytecodeCall=*/writeBytecodeSegmentSize,
         /*hashPropertyCall=*/
-        "llvm::hash_combine_range(std::begin($_storage), "
+        "::llvm::hash_combine_range(std::begin($_storage), "
         "std::end($_storage));",
         /*StringRef defaultValue=*/"");
   };
@@ -1449,7 +1449,7 @@ void OpEmitter::genPropertiesSupport() {
 )decl";
   const char *setInherentAttrMethodFmt = R"decl(
     if (name == "{0}") {{
-       prop.{0} = dyn_cast_or_null<std::remove_reference_t<decltype(prop.{0})>>(value);
+       prop.{0} = ::llvm::dyn_cast_or_null<std::remove_reference_t<decltype(prop.{0})>>(value);
        return;
     }
 )decl";
@@ -1502,7 +1502,7 @@ void OpEmitter::genPropertiesSupport() {
                      resultSegmentAttrName);
     }
     setInherentAttrMethod << formatv(R"decl(
-       auto arrAttr = dyn_cast_or_null<DenseI32ArrayAttr>(value);
+       auto arrAttr = ::llvm::dyn_cast_or_null<::mlir::DenseI32ArrayAttr>(value);
        if (!arrAttr) return;
        if (arrAttr.size() != sizeof(prop.{0}) / sizeof(int32_t))
          return;
@@ -2060,7 +2060,7 @@ void OpEmitter::genNamedOperandSetters() {
       if (emitHelper.hasProperties())
         body << formatv(", ::mlir::MutableOperandRange::OperandSegment({0}u, "
                         "{{getOperandSegmentSizesAttrName(), "
-                        "DenseI32ArrayAttr::get(getContext(), "
+                        "::mlir::DenseI32ArrayAttr::get(getContext(), "
                         "getProperties().operandSegmentSizes)})",
                         i);
       else
@@ -2299,7 +2299,7 @@ void OpEmitter::genSeparateArgParamBuilder() {
       // the length of the type ranges.
       if (op.getTrait("::mlir::OpTrait::AttrSizedResultSegments")) {
         if (op.getDialect().usePropertiesForAttributes()) {
-          body << "  llvm::copy(ArrayRef<int32_t>({";
+          body << "  ::llvm::copy(::llvm::ArrayRef<int32_t>({";
         } else {
           std::string getterName = op.getGetterName(resultSegmentAttrName);
           body << " " << builderOpState << ".addAttribute(" << getterName
@@ -2947,7 +2947,7 @@ void OpEmitter::genCodeForAddingArgAndRegionForBuilder(
   if (op.getTrait("::mlir::OpTrait::AttrSizedOperandSegments")) {
     std::string sizes = op.getGetterName(operandSegmentAttrName);
     if (op.getDialect().usePropertiesForAttributes()) {
-      body << "  llvm::copy(ArrayRef<int32_t>({";
+      body << "  ::llvm::copy(::llvm::ArrayRef<int32_t>({";
       emitSegment();
       body << "}), " << builderOpState
            << ".getOrAddProperties<Properties>()."
diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp
index 5f33f7faf80c0..546d4616f7173 100644
--- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp
@@ -1662,7 +1662,7 @@ void OperationFormat::genParserVariadicSegmentResolution(Operator &op,
           body << "1";
       };
       if (op.getDialect().usePropertiesForAttributes()) {
-        body << "llvm::copy(ArrayRef<int32_t>({";
+        body << "::llvm::copy(::llvm::ArrayRef<int32_t>({";
         llvm::interleaveComma(op.getOperands(), body, interleaveFn);
         body << formatv("}), "
                         "result.getOrAddProperties<{0}::Properties>()."

From c5f0c32da7778e6e712b746dac35a628c86af265 Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph@gmail.com>
Date: Tue, 22 Aug 2023 12:54:50 -0700
Subject: [PATCH 15/92] Fix MLIR pass manager initialization: hash the pass
 pipeline to detect when initialization is needed

The current logic hashes the context to detect registration changes and re-run
the pass initialization. However it wasn't checking for changes to the
pipeline, so a pass that would get added after a first run would not be
initialized during subsequent runs.

Reviewed By: Mogball

Differential Revision: https://reviews.llvm.org/D158377
---
 mlir/include/mlir/Pass/PassManager.h    |  8 +++++-
 mlir/lib/Pass/Pass.cpp                  | 22 ++++++++++++++-
 mlir/unittests/Pass/PassManagerTest.cpp | 36 +++++++++++++++++++++++++
 3 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Pass/PassManager.h b/mlir/include/mlir/Pass/PassManager.h
index 75fe1524221c1..d5f1ea0fe0350 100644
--- a/mlir/include/mlir/Pass/PassManager.h
+++ b/mlir/include/mlir/Pass/PassManager.h
@@ -172,6 +172,10 @@ class OpPassManager {
   /// if a pass manager has already been initialized.
   LogicalResult initialize(MLIRContext *context, unsigned newInitGeneration);
 
+  /// Compute a hash of the pipeline, so that we can detect changes (a pass is
+  /// added...).
+  llvm::hash_code hash();
+
   /// A pointer to an internal implementation instance.
   std::unique_ptr<detail::OpPassManagerImpl> impl;
 
@@ -439,9 +443,11 @@ class PassManager : public OpPassManager {
   /// generate reproducers.
   std::unique_ptr<detail::PassCrashReproducerGenerator> crashReproGenerator;
 
-  /// A hash key used to detect when reinitialization is necessary.
+  /// Hash keys used to detect when reinitialization is necessary.
   llvm::hash_code initializationKey =
       DenseMapInfo<llvm::hash_code>::getTombstoneKey();
+  llvm::hash_code pipelineInitializationKey =
+      DenseMapInfo<llvm::hash_code>::getTombstoneKey();
 
   /// Flag that specifies if pass timing is enabled.
   bool passTiming : 1;
diff --git a/mlir/lib/Pass/Pass.cpp b/mlir/lib/Pass/Pass.cpp
index fe4597f3df3d2..44b83c22fd515 100644
--- a/mlir/lib/Pass/Pass.cpp
+++ b/mlir/lib/Pass/Pass.cpp
@@ -18,6 +18,7 @@
 #include "mlir/IR/Threading.h"
 #include "mlir/IR/Verifier.h"
 #include "mlir/Support/FileUtilities.h"
+#include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/Support/CommandLine.h"
@@ -424,6 +425,23 @@ LogicalResult OpPassManager::initialize(MLIRContext *context,
   return success();
 }
 
+llvm::hash_code OpPassManager::hash() {
+  llvm::hash_code hashCode;
+  for (Pass &pass : getPasses()) {
+    // If this pass isn't an adaptor, directly hash it.
+    auto *adaptor = dyn_cast<OpToOpPassAdaptor>(&pass);
+    if (!adaptor) {
+      hashCode = llvm::hash_combine(hashCode, &pass);
+      continue;
+    }
+    // Otherwise, recursively fold in the hash of each nested pass manager.
+    for (OpPassManager &adaptorPM : adaptor->getPassManagers())
+      hashCode = llvm::hash_combine(hashCode, adaptorPM.hash());
+  }
+  return hashCode;
+}
+
+
 //===----------------------------------------------------------------------===//
 // OpToOpPassAdaptor
 //===----------------------------------------------------------------------===//
@@ -825,10 +843,12 @@ LogicalResult PassManager::run(Operation *op) {
 
   // Initialize all of the passes within the pass manager with a new generation.
   llvm::hash_code newInitKey = context->getRegistryHash();
-  if (newInitKey != initializationKey) {
+  llvm::hash_code pipelineKey = hash();
+  if (newInitKey != initializationKey || pipelineKey != pipelineInitializationKey) {
     if (failed(initialize(context, impl->initializationGeneration + 1)))
       return failure();
     initializationKey = newInitKey;
+    pipelineInitializationKey = pipelineKey;
   }
 
   // Construct a top level analysis manager for the pipeline.
diff --git a/mlir/unittests/Pass/PassManagerTest.cpp b/mlir/unittests/Pass/PassManagerTest.cpp
index 97349d681c3a0..70a679125c0ea 100644
--- a/mlir/unittests/Pass/PassManagerTest.cpp
+++ b/mlir/unittests/Pass/PassManagerTest.cpp
@@ -10,6 +10,7 @@
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/Diagnostics.h"
 #include "mlir/Pass/Pass.h"
 #include "gtest/gtest.h"
 
@@ -144,4 +145,39 @@ TEST(PassManagerTest, InvalidPass) {
                "intend to nest?");
 }
 
+/// Simple pass that fails when run without `initialize` having been called.
+struct InitializeCheckingPass
+    : public PassWrapper<InitializeCheckingPass, OperationPass<ModuleOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(InitializeCheckingPass)
+  LogicalResult initialize(MLIRContext *ctx) final {
+    initialized = true;
+    return success();
+  }
+  bool initialized = false;
+
+  void runOnOperation() override {
+    if (!initialized) {
+      getOperation()->emitError() << "Pass isn't initialized!";
+      signalPassFailure();
+    }
+  }
+};
+
+TEST(PassManagerTest, PassInitialization) {
+  MLIRContext context;
+  context.allowUnregisteredDialects();
+
+  // Create a module
+  OwningOpRef<ModuleOp> module(ModuleOp::create(UnknownLoc::get(&context)));
+
+  // Instantiate and run our pass.
+  auto pm = PassManager::on<ModuleOp>(&context);
+  pm.addPass(std::make_unique<InitializeCheckingPass>());
+  EXPECT_TRUE(succeeded(pm.run(module.get())));
+
+  // Adding a second copy of the pass, we should also initialize it!
+  pm.addPass(std::make_unique<InitializeCheckingPass>());
+  EXPECT_TRUE(succeeded(pm.run(module.get())));
+}
+
 } // namespace

From 69946c8c9ce76f3bf6fbac789ef426a468d6eb5a Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph@gmail.com>
Date: Tue, 22 Aug 2023 16:27:06 -0700
Subject: [PATCH 16/92] Fix MSAN error: use of uninitialized value when hashing
 the MLIR pass manager (NFC)

---
 mlir/lib/Pass/Pass.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Pass/Pass.cpp b/mlir/lib/Pass/Pass.cpp
index 44b83c22fd515..a562a00eb1953 100644
--- a/mlir/lib/Pass/Pass.cpp
+++ b/mlir/lib/Pass/Pass.cpp
@@ -426,7 +426,7 @@ LogicalResult OpPassManager::initialize(MLIRContext *context,
 }
 
 llvm::hash_code OpPassManager::hash() {
-  llvm::hash_code hashCode;
+  llvm::hash_code hashCode{};
   for (Pass &pass : getPasses()) {
     // If this pass isn't an adaptor, directly hash it.
     auto *adaptor = dyn_cast<OpToOpPassAdaptor>(&pass);

From a95298a0ed4b42dc79ebda2db1f8f371447c37db Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph@gmail.com>
Date: Tue, 8 Aug 2023 18:58:12 -0700
Subject: [PATCH 17/92] Fix canonicalizer to copy the entire
 GreedyRewriteConfig instead of selected fields

It is surprising for the user that only some fields were honored.

Also make the FrozenRewritePatternSet a shared_ptr<const T>.

Fixes #64543

Differential Revision: https://reviews.llvm.org/D157469
---
 mlir/include/mlir/IR/OpImplementation.h | 14 ++++++++------
 mlir/lib/Transforms/Canonicalizer.cpp   | 24 +++++++++++++-----------
 2 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/mlir/include/mlir/IR/OpImplementation.h b/mlir/include/mlir/IR/OpImplementation.h
index 0eeb8bb1ec8da..2131fe313f8c5 100644
--- a/mlir/include/mlir/IR/OpImplementation.h
+++ b/mlir/include/mlir/IR/OpImplementation.h
@@ -715,18 +715,20 @@ class AsmParser {
   //===--------------------------------------------------------------------===//
 
   /// This class represents a StringSwitch like class that is useful for parsing
-  /// expected keywords. On construction, it invokes `parseKeyword` and
-  /// processes each of the provided cases statements until a match is hit. The
-  /// provided `ResultT` must be assignable from `failure()`.
+  /// expected keywords. On construction, unless a non-empty keyword is
+  /// provided, it invokes `parseKeyword` and processes each of the provided
+  /// cases statements until a match is hit. The provided `ResultT` must be
+  /// assignable from `failure()`.
   template <typename ResultT = ParseResult>
   class KeywordSwitch {
   public:
-    KeywordSwitch(AsmParser &parser)
+    KeywordSwitch(AsmParser &parser, StringRef *keyword = nullptr)
         : parser(parser), loc(parser.getCurrentLocation()) {
-      if (failed(parser.parseKeywordOrCompletion(&keyword)))
+      if (keyword && !keyword->empty())
+        this->keyword = *keyword;
+      else if (failed(parser.parseKeywordOrCompletion(&this->keyword)))
         result = failure();
     }
-
     /// Case that uses the provided value when true.
     KeywordSwitch &Case(StringLiteral str, ResultT value) {
       return Case(str, [&](StringRef, SMLoc) { return std::move(value); });
diff --git a/mlir/lib/Transforms/Canonicalizer.cpp b/mlir/lib/Transforms/Canonicalizer.cpp
index b4ad85c7c7dad..d50019bd6aee5 100644
--- a/mlir/lib/Transforms/Canonicalizer.cpp
+++ b/mlir/lib/Transforms/Canonicalizer.cpp
@@ -29,7 +29,8 @@ struct Canonicalizer : public impl::CanonicalizerBase<Canonicalizer> {
   Canonicalizer() = default;
   Canonicalizer(const GreedyRewriteConfig &config,
                 ArrayRef<std::string> disabledPatterns,
-                ArrayRef<std::string> enabledPatterns) {
+                ArrayRef<std::string> enabledPatterns)
+      : config(config) {
     this->topDownProcessingEnabled = config.useTopDownTraversal;
     this->enableRegionSimplification = config.enableRegionSimplification;
     this->maxIterations = config.maxIterations;
@@ -41,30 +42,31 @@ struct Canonicalizer : public impl::CanonicalizerBase<Canonicalizer> {
   /// Initialize the canonicalizer by building the set of patterns used during
   /// execution.
   LogicalResult initialize(MLIRContext *context) override {
+    // Set the config from possible pass options set in the meantime.
+    config.useTopDownTraversal = topDownProcessingEnabled;
+    config.enableRegionSimplification = enableRegionSimplification;
+    config.maxIterations = maxIterations;
+    config.maxNumRewrites = maxNumRewrites;
+
     RewritePatternSet owningPatterns(context);
     for (auto *dialect : context->getLoadedDialects())
       dialect->getCanonicalizationPatterns(owningPatterns);
     for (RegisteredOperationName op : context->getRegisteredOperations())
       op.getCanonicalizationPatterns(owningPatterns, context);
 
-    patterns = FrozenRewritePatternSet(std::move(owningPatterns),
-                                       disabledPatterns, enabledPatterns);
+    patterns = std::make_shared<FrozenRewritePatternSet>(
+        std::move(owningPatterns), disabledPatterns, enabledPatterns);
     return success();
   }
   void runOnOperation() override {
-    GreedyRewriteConfig config;
-    config.useTopDownTraversal = topDownProcessingEnabled;
-    config.enableRegionSimplification = enableRegionSimplification;
-    config.maxIterations = maxIterations;
-    config.maxNumRewrites = maxNumRewrites;
     LogicalResult converged =
-        applyPatternsAndFoldGreedily(getOperation(), patterns, config);
+        applyPatternsAndFoldGreedily(getOperation(), *patterns, config);
     // Canonicalization is best-effort. Non-convergence is not a pass failure.
     if (testConvergence && failed(converged))
       signalPassFailure();
   }
-
-  FrozenRewritePatternSet patterns;
+  GreedyRewriteConfig config;
+  std::shared_ptr<const FrozenRewritePatternSet> patterns;
 };
 } // namespace
 

From 5e47fe1945952aa7719f1a139cdabd7b37cf6511 Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph@gmail.com>
Date: Wed, 23 Aug 2023 15:52:57 -0700
Subject: [PATCH 18/92] Fix ODS verifier emission for DerivedAttr when
 Properties are enabled

Differential Revision: https://reviews.llvm.org/D158679
---
 mlir/test/lib/Dialect/Test/TestOps.td       | 10 ++++++++++
 mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp |  4 +++-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td
index 4eb19e6dd6fe2..12a02cf72d2b3 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.td
+++ b/mlir/test/lib/Dialect/Test/TestOps.td
@@ -264,6 +264,16 @@ def DerivedTypeAttrOp : TEST_Op<"derived_type_attr", []> {
     "$_builder.getI32IntegerAttr($_self)">;
 }
 
+def TestPropOp : TEST_Op<"prop">,
+  Arguments<(ins Variadic<Index>:$upperInits,
+      I32ElementsAttr:$transforms)>,
+  Results<(outs Variadic<AnyType>:$results)> {
+  DerivedAttr upperLen = DerivedAttr<"uint32_t", [{
+    return getUpperInits().size() / getTransforms().size();
+  }], [{ $_builder.getI32IntegerAttr($_self) }]>;
+}
+
+
 def StringElementsAttrOp : TEST_Op<"string_elements_attr"> {
   let arguments = (ins
       StringElementsAttr:$scalar_string_attr
diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
index 4e07bc3d48272..a3b9c71048422 100644
--- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
@@ -868,10 +868,12 @@ while (true) {{
   if (useProperties) {
     for (const std::pair<StringRef, AttributeMetadata> &it :
          emitHelper.getAttrMetadata()) {
+      const AttributeMetadata &metadata = it.second;
+      if (metadata.constraint && metadata.constraint->isDerivedAttr())
+        continue;
       body << formatv(
           "auto tblgen_{0} = getProperties().{0}; (void)tblgen_{0};\n",
           it.first);
-      const AttributeMetadata &metadata = it.second;
       if (metadata.isRequired)
         body << formatv(
             "if (!tblgen_{0}) return {1}\"requires attribute '{0}'\");\n",

From 4b414e52ac10e04de26ad66ca4590605e836af24 Mon Sep 17 00:00:00 2001
From: XinWang10 <xin10.wang@intel.com>
Date: Fri, 18 Aug 2023 00:08:01 -0700
Subject: [PATCH 19/92] Fix regression of D157680

The test cases added in D157680 should be target-specific, but some of their RUN lines are missing the target triple. Add it back so that the buildbots pass.

Reviewed By: skan, Hahnfeld

Differential Revision: https://reviews.llvm.org/D158252
---
 llvm/test/CodeGen/X86/x86-prefer-no-gather-no-scatter.ll | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/X86/x86-prefer-no-gather-no-scatter.ll b/llvm/test/CodeGen/X86/x86-prefer-no-gather-no-scatter.ll
index e3f3622f146d9..33250b3495a00 100644
--- a/llvm/test/CodeGen/X86/x86-prefer-no-gather-no-scatter.ll
+++ b/llvm/test/CodeGen/X86/x86-prefer-no-gather-no-scatter.ll
@@ -1,6 +1,6 @@
 ; Check that if option prefer-no-gather/scatter can disable gather/scatter instructions.
-; RUN: llc -mattr=+avx2,+fast-gather %s -o - | FileCheck %s --check-prefixes=GATHER
-; RUN: llc -mattr=+avx2,+fast-gather,+prefer-no-gather %s -o - | FileCheck %s --check-prefixes=NO-GATHER
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2,+fast-gather %s -o - | FileCheck %s --check-prefixes=GATHER
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2,+fast-gather,+prefer-no-gather %s -o - | FileCheck %s --check-prefixes=NO-GATHER
 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu  -mattr=+avx512vl,+avx512dq < %s | FileCheck %s --check-prefix=SCATTER
 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu  -mattr=+avx512vl,+avx512dq,+prefer-no-gather < %s | FileCheck %s --check-prefix=SCATTER-NO-GATHER
 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu  -mattr=+avx512vl,+avx512dq,+prefer-no-scatter < %s | FileCheck %s --check-prefix=GATHER-NO-SCATTER

From 01f7134c0ea8caff86828df3f979726dffa33924 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Mon, 14 Aug 2023 16:52:02 -0700
Subject: [PATCH 20/92] [libc++] Fix problems with GCC 13 and switch to it in
 the CI

Reviewed By: #libc, #libc_abi, Mordante

Spies: arphaman, Mordante, libcxx-commits, arichardson

Differential Revision: https://reviews.llvm.org/D157060
---
 libcxx/include/__type_traits/is_nothrow_constructible.h   | 3 ++-
 libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp     | 2 +-
 .../std/algorithms/robust_against_adl.compile.pass.cpp    | 2 +-
 .../format.formatter.spec/formatter.char_array.pass.cpp   | 2 +-
 .../std/utilities/meta/meta.rel/is_convertible.pass.cpp   | 3 +++
 .../meta/meta.rel/is_convertible_fallback.pass.cpp        | 2 ++
 libcxx/utils/libcxx/test/params.py                        | 8 ++++++++
 libcxxabi/test/catch_member_function_pointer_02.pass.cpp  | 2 +-
 8 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/libcxx/include/__type_traits/is_nothrow_constructible.h b/libcxx/include/__type_traits/is_nothrow_constructible.h
index d4686d89fd96e..4949062433b78 100644
--- a/libcxx/include/__type_traits/is_nothrow_constructible.h
+++ b/libcxx/include/__type_traits/is_nothrow_constructible.h
@@ -22,7 +22,8 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-#if __has_builtin(__is_nothrow_constructible)
+// GCC is disabled due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106611
+#if __has_builtin(__is_nothrow_constructible) && !defined(_LIBCPP_COMPILER_GCC)
 
 template < class _Tp, class... _Args>
 struct _LIBCPP_TEMPLATE_VIS is_nothrow_constructible
diff --git a/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp b/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp
index cdae8e8834e65..d75951fdf890e 100644
--- a/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp
+++ b/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp
@@ -15,7 +15,7 @@
 // UNSUPPORTED: clang-15, clang-16, clang-17
 
 // TODO: Investigate this failure on GCC 12 (in Ubuntu Jammy)
-// UNSUPPORTED: gcc-12
+// UNSUPPORTED: gcc-12, gcc-13
 
 // RUN: %{cxx} %{flags} %s -o %t.exe %{compile_flags} -g %{link_flags}
 // Ensure locale-independence for unicode tests.
diff --git a/libcxx/test/std/algorithms/robust_against_adl.compile.pass.cpp b/libcxx/test/std/algorithms/robust_against_adl.compile.pass.cpp
index 1411796d65963..77c88873073c9 100644
--- a/libcxx/test/std/algorithms/robust_against_adl.compile.pass.cpp
+++ b/libcxx/test/std/algorithms/robust_against_adl.compile.pass.cpp
@@ -11,7 +11,7 @@
 // https://buildkite.com/llvm-project/libcxx-ci/builds/15823#0184fc0b-d56b-4774-9e1d-35fe24e09e37
 // It seems like the CI gcc version is buggy. I can't reproduce the failure on my system or on
 // godbolt (https://godbolt.org/z/rsPv8e8fn).
-// UNSUPPORTED: gcc-12
+// UNSUPPORTED: gcc-12, gcc-13
 
 #include <algorithm>
 #include <cstddef>
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp
index 70ddab63f0c41..84e2c8ab1af0c 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp
@@ -7,7 +7,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 // TODO FMT __builtin_memcpy isn't constexpr in GCC
-// UNSUPPORTED: gcc-12
+// UNSUPPORTED: gcc-12, gcc-13
 
 // <format>
 
diff --git a/libcxx/test/std/utilities/meta/meta.rel/is_convertible.pass.cpp b/libcxx/test/std/utilities/meta/meta.rel/is_convertible.pass.cpp
index 804650fde3f3e..b96c9b11e2962 100644
--- a/libcxx/test/std/utilities/meta/meta.rel/is_convertible.pass.cpp
+++ b/libcxx/test/std/utilities/meta/meta.rel/is_convertible.pass.cpp
@@ -117,9 +117,12 @@ int main(int, char**)
 
     // Non-referencable function type
     static_assert((!std::is_convertible<ConstFunction, Function>::value), "");
+// TODO(LLVM-19): Re-enable this once we switch to GCC 14. This is https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109680
+#ifndef TEST_COMPILER_GCC
     static_assert((!std::is_convertible<ConstFunction, Function*>::value), "");
     static_assert((!std::is_convertible<ConstFunction, Function&>::value), "");
     static_assert((!std::is_convertible<ConstFunction, Function&&>::value), "");
+#endif
     static_assert((!std::is_convertible<Function*, ConstFunction>::value), "");
     static_assert((!std::is_convertible<Function&, ConstFunction>::value), "");
     static_assert((!std::is_convertible<ConstFunction, ConstFunction>::value), "");
diff --git a/libcxx/test/std/utilities/meta/meta.rel/is_convertible_fallback.pass.cpp b/libcxx/test/std/utilities/meta/meta.rel/is_convertible_fallback.pass.cpp
index 28495cfebd45c..6e420d63dbd59 100644
--- a/libcxx/test/std/utilities/meta/meta.rel/is_convertible_fallback.pass.cpp
+++ b/libcxx/test/std/utilities/meta/meta.rel/is_convertible_fallback.pass.cpp
@@ -10,6 +10,8 @@
 
 // ADDITIONAL_COMPILE_FLAGS: -D _LIBCPP_USE_IS_CONVERTIBLE_FALLBACK
 
+// UNSUPPORTED: gcc-13
+
 // type_traits
 
 // is_convertible
diff --git a/libcxx/utils/libcxx/test/params.py b/libcxx/utils/libcxx/test/params.py
index f43c634a1644f..f1401d7afc635 100644
--- a/libcxx/utils/libcxx/test/params.py
+++ b/libcxx/utils/libcxx/test/params.py
@@ -55,6 +55,14 @@
     # Don't fail compilation in case the compiler fails to perform the requested
     # loop vectorization.
     "-Wno-pass-failed",
+
+    # TODO: Find out why GCC warns in lots of places (is this a problem with always_inline?)
+    "-Wno-dangling-reference",
+    "-Wno-mismatched-new-delete",
+    "-Wno-redundant-move",
+
+    # This doesn't make sense in real code, but we have to test it because the standard requires us to not break
+    "-Wno-self-move",
 ]
 
 _allStandards = ["c++03", "c++11", "c++14", "c++17", "c++20", "c++23", "c++26"]
diff --git a/libcxxabi/test/catch_member_function_pointer_02.pass.cpp b/libcxxabi/test/catch_member_function_pointer_02.pass.cpp
index 3236f9aae1de1..667447db1e68a 100644
--- a/libcxxabi/test/catch_member_function_pointer_02.pass.cpp
+++ b/libcxxabi/test/catch_member_function_pointer_02.pass.cpp
@@ -15,7 +15,7 @@
 
 // GCC supports noexcept function types but this test still fails.
 // This is likely a bug in their implementation. Investigation needed.
-// XFAIL: gcc-11, gcc-12
+// XFAIL: gcc-11, gcc-12, gcc-13
 
 #include <cassert>
 

From 231dab0e40e7f654bf65b9182d00df7b0f8f9551 Mon Sep 17 00:00:00 2001
From: Ian Anderson <iana@apple.com>
Date: Thu, 10 Aug 2023 11:22:48 -0700
Subject: [PATCH 21/92] [libc++] Move header_information.py up from tests

I need to use header_information.py in a generator script that isn't for tests in an upcoming change. Move it up a level so that it's in utils/libcxx instead of utils/libcxx/test.

Differential Revision: https://reviews.llvm.org/D157639

# Conflicts:
#	libcxx/test/libcxx/system_reserved_names.gen.py
---
 libcxx/docs/Contributing.rst                                    | 2 +-
 .../test/libcxx/assertions/headers_declare_verbose_abort.gen.py | 2 +-
 libcxx/test/libcxx/clang_tidy.gen.py                            | 2 +-
 libcxx/test/libcxx/double_include.gen.py                        | 2 +-
 libcxx/test/libcxx/header_inclusions.gen.py                     | 2 +-
 libcxx/test/libcxx/libcpp_version.gen.py                        | 2 +-
 libcxx/test/libcxx/module_std.gen.py                            | 2 +-
 libcxx/test/libcxx/modules_include.gen.py                       | 2 +-
 libcxx/test/libcxx/nasty_macros.gen.py                          | 2 +-
 libcxx/test/libcxx/no_assert_include.gen.py                     | 2 +-
 libcxx/test/libcxx/transitive_includes.gen.py                   | 2 +-
 libcxx/utils/libcxx/{test => }/header_information.py            | 2 +-
 12 files changed, 12 insertions(+), 12 deletions(-)
 rename libcxx/utils/libcxx/{test => }/header_information.py (99%)

diff --git a/libcxx/docs/Contributing.rst b/libcxx/docs/Contributing.rst
index cfae4d67ffdf4..3e3032ece99e4 100644
--- a/libcxx/docs/Contributing.rst
+++ b/libcxx/docs/Contributing.rst
@@ -49,7 +49,7 @@ sure you don't forget anything:
 
   - Did you add it to ``include/module.modulemap.in``?
   - Did you add it to ``include/CMakeLists.txt``?
-  - If it's a public header, did you update ``utils/libcxx/test/header_information.py``?
+  - If it's a public header, did you update ``utils/libcxx/header_information.py``?
 
 - Did you add the relevant feature test macro(s) for your feature? Did you update the ``generate_feature_test_macro_components.py`` script with it?
 - Did you run the ``libcxx-generate-files`` target and verify its output?
diff --git a/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.gen.py b/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.gen.py
index f72c257402936..a4e1c3c29c936 100644
--- a/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.gen.py
+++ b/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.gen.py
@@ -14,7 +14,7 @@
 
 import sys
 sys.path.append(sys.argv[1])
-from libcxx.test.header_information import lit_header_restrictions, public_headers
+from libcxx.header_information import lit_header_restrictions, public_headers
 
 for header in public_headers:
     # Skip C compatibility headers.
diff --git a/libcxx/test/libcxx/clang_tidy.gen.py b/libcxx/test/libcxx/clang_tidy.gen.py
index a7b8e7b3ec549..b2f1a171507d1 100644
--- a/libcxx/test/libcxx/clang_tidy.gen.py
+++ b/libcxx/test/libcxx/clang_tidy.gen.py
@@ -12,7 +12,7 @@
 
 import sys
 sys.path.append(sys.argv[1])
-from libcxx.test.header_information import lit_header_restrictions, public_headers
+from libcxx.header_information import lit_header_restrictions, public_headers
 
 for header in public_headers:
   BLOCKLIT = '' # block Lit from interpreting a RUN/XFAIL/etc inside the generation script
diff --git a/libcxx/test/libcxx/double_include.gen.py b/libcxx/test/libcxx/double_include.gen.py
index ad18121d53be0..85055dfc703de 100644
--- a/libcxx/test/libcxx/double_include.gen.py
+++ b/libcxx/test/libcxx/double_include.gen.py
@@ -12,7 +12,7 @@
 
 import sys
 sys.path.append(sys.argv[1])
-from libcxx.test.header_information import lit_header_restrictions, public_headers
+from libcxx.header_information import lit_header_restrictions, public_headers
 
 for header in public_headers:
   BLOCKLIT = '' # block Lit from interpreting a RUN/XFAIL/etc inside the generation script
diff --git a/libcxx/test/libcxx/header_inclusions.gen.py b/libcxx/test/libcxx/header_inclusions.gen.py
index f41ac27b651b3..cdbc5b34b5152 100644
--- a/libcxx/test/libcxx/header_inclusions.gen.py
+++ b/libcxx/test/libcxx/header_inclusions.gen.py
@@ -13,7 +13,7 @@
 
 import sys
 sys.path.append(sys.argv[1])
-from libcxx.test.header_information import lit_header_restrictions, public_headers, mandatory_inclusions
+from libcxx.header_information import lit_header_restrictions, public_headers, mandatory_inclusions
 
 for header in public_headers:
   header_guard = lambda h: f"_LIBCPP_{h.upper().replace('.', '_').replace('/', '_')}"
diff --git a/libcxx/test/libcxx/libcpp_version.gen.py b/libcxx/test/libcxx/libcpp_version.gen.py
index 6a43d5dc3e4ae..47439b08fe51b 100644
--- a/libcxx/test/libcxx/libcpp_version.gen.py
+++ b/libcxx/test/libcxx/libcpp_version.gen.py
@@ -12,7 +12,7 @@
 
 import sys
 sys.path.append(sys.argv[1])
-from libcxx.test.header_information import lit_header_restrictions, public_headers
+from libcxx.header_information import lit_header_restrictions, public_headers
 
 for header in public_headers:
   print(f"""\
diff --git a/libcxx/test/libcxx/module_std.gen.py b/libcxx/test/libcxx/module_std.gen.py
index 787317888d20d..db0678e221bef 100644
--- a/libcxx/test/libcxx/module_std.gen.py
+++ b/libcxx/test/libcxx/module_std.gen.py
@@ -21,7 +21,7 @@
 import sys
 
 sys.path.append(sys.argv[1])
-from libcxx.test.header_information import toplevel_headers
+from libcxx.header_information import toplevel_headers
 
 BLOCKLIT = (
     ""  # block Lit from interpreting a RUN/XFAIL/etc inside the generation script
diff --git a/libcxx/test/libcxx/modules_include.gen.py b/libcxx/test/libcxx/modules_include.gen.py
index b6bad1b8a104d..8ca50b0877eef 100644
--- a/libcxx/test/libcxx/modules_include.gen.py
+++ b/libcxx/test/libcxx/modules_include.gen.py
@@ -14,7 +14,7 @@
 
 import sys
 sys.path.append(sys.argv[1])
-from libcxx.test.header_information import lit_header_restrictions, public_headers
+from libcxx.header_information import lit_header_restrictions, public_headers
 
 for header in public_headers:
   BLOCKLIT = '' # block Lit from interpreting a RUN/XFAIL/etc inside the generation script
diff --git a/libcxx/test/libcxx/nasty_macros.gen.py b/libcxx/test/libcxx/nasty_macros.gen.py
index fdc308416f341..3c501a981d033 100644
--- a/libcxx/test/libcxx/nasty_macros.gen.py
+++ b/libcxx/test/libcxx/nasty_macros.gen.py
@@ -13,7 +13,7 @@
 
 import sys
 sys.path.append(sys.argv[1])
-from libcxx.test.header_information import lit_header_restrictions, public_headers
+from libcxx.header_information import lit_header_restrictions, public_headers
 
 for header in public_headers:
   print(f"""\
diff --git a/libcxx/test/libcxx/no_assert_include.gen.py b/libcxx/test/libcxx/no_assert_include.gen.py
index 45152a35f3177..a5e733d2b48a1 100644
--- a/libcxx/test/libcxx/no_assert_include.gen.py
+++ b/libcxx/test/libcxx/no_assert_include.gen.py
@@ -13,7 +13,7 @@
 
 import sys
 sys.path.append(sys.argv[1])
-from libcxx.test.header_information import lit_header_restrictions, public_headers
+from libcxx.header_information import lit_header_restrictions, public_headers
 
 for header in public_headers:
   if header == 'cassert':
diff --git a/libcxx/test/libcxx/transitive_includes.gen.py b/libcxx/test/libcxx/transitive_includes.gen.py
index c446ceff7fef6..2ac5277878bee 100644
--- a/libcxx/test/libcxx/transitive_includes.gen.py
+++ b/libcxx/test/libcxx/transitive_includes.gen.py
@@ -20,7 +20,7 @@
 
 import sys
 sys.path.append(sys.argv[1])
-from libcxx.test.header_information import lit_header_restrictions, public_headers
+from libcxx.header_information import lit_header_restrictions, public_headers
 
 import re
 
diff --git a/libcxx/utils/libcxx/test/header_information.py b/libcxx/utils/libcxx/header_information.py
similarity index 99%
rename from libcxx/utils/libcxx/test/header_information.py
rename to libcxx/utils/libcxx/header_information.py
index 9ca0e9548c724..f23a896180b31 100644
--- a/libcxx/utils/libcxx/test/header_information.py
+++ b/libcxx/utils/libcxx/header_information.py
@@ -136,7 +136,7 @@ def is_header(file):
         and file.name != "libcxx.imp"
     )
 
-libcxx_root = pathlib.Path(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))))
+libcxx_root = pathlib.Path(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
 include = pathlib.Path(os.path.join(libcxx_root, "include"))
 test = pathlib.Path(os.path.join(libcxx_root, "test"))
 assert libcxx_root.exists()

From ae498406499af87687d16efada56144f1639c424 Mon Sep 17 00:00:00 2001
From: Ian Anderson <iana@apple.com>
Date: Mon, 7 Aug 2023 22:54:23 -0700
Subject: [PATCH 22/92] [libc++][Modules] Remove unnecessary `requires` from
 the module map

Top level modules don't need `requires` because they're only built when their headers are included.

Reviewed By: ldionne, Mordante, #libc

Differential Revision: https://reviews.llvm.org/D157363
---
 libcxx/include/module.modulemap.in | 36 ------------------------------
 1 file changed, 36 deletions(-)

diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in
index bbb7090fd4bea..37a9edcd7ece1 100644
--- a/libcxx/include/module.modulemap.in
+++ b/libcxx/include/module.modulemap.in
@@ -16,7 +16,6 @@ module std_atomic [system] {
   export *
 }
 module std_barrier [system] {
-  @requires_LIBCXX_ENABLE_THREADS@
   header "barrier"
   export *
 }
@@ -37,7 +36,6 @@ module std_chrono [system] {
   export *
 }
 module std_codecvt [system] {
-  @requires_LIBCXX_ENABLE_LOCALIZATION@
   header "codecvt"
   export *
 }
@@ -78,7 +76,6 @@ module std_expected [system] {
   export *
 }
 module std_filesystem [system] {
-  @requires_LIBCXX_ENABLE_FILESYSTEM@
   header "filesystem"
   export *
 }
@@ -91,8 +88,6 @@ module std_forward_list [system] {
   export *
 }
 module std_fstream [system] {
-  @requires_LIBCXX_ENABLE_LOCALIZATION@
-  @requires_LIBCXX_ENABLE_FILESYSTEM@
   header "fstream"
   export *
 }
@@ -101,7 +96,6 @@ module std_functional [system] {
   export *
 }
 module std_future [system] {
-  @requires_LIBCXX_ENABLE_THREADS@
   header "future"
   export *
 }
@@ -110,12 +104,10 @@ module std_initializer_list [system] {
   export *
 }
 module std_iomanip [system] {
-  @requires_LIBCXX_ENABLE_LOCALIZATION@
   header "iomanip"
   export *
 }
 module std_ios [system] {
-  @requires_LIBCXX_ENABLE_LOCALIZATION@
   header "ios"
   export *
 }
@@ -124,12 +116,10 @@ module std_iosfwd [system] {
   export *
 }
 module std_iostream [system] {
-  @requires_LIBCXX_ENABLE_LOCALIZATION@
   header "iostream"
   export *
 }
 module std_istream [system] {
-  @requires_LIBCXX_ENABLE_LOCALIZATION@
   header "istream"
   export *
 }
@@ -138,7 +128,6 @@ module std_iterator [system] {
   export *
 }
 module std_latch [system] {
-  @requires_LIBCXX_ENABLE_THREADS@
   header "latch"
   export *
 }
@@ -151,7 +140,6 @@ module std_list [system] {
   export *
 }
 module std_locale [system] {
-  @requires_LIBCXX_ENABLE_LOCALIZATION@
   header "locale"
   export *
 }
@@ -192,7 +180,6 @@ module std_optional [system] {
   export *
 }
 module std_ostream [system] {
-  @requires_LIBCXX_ENABLE_LOCALIZATION@
   header "ostream"
   export *
 }
@@ -217,7 +204,6 @@ module std_ratio [system] {
   export *
 }
 module std_regex [system] {
-  @requires_LIBCXX_ENABLE_LOCALIZATION@
   header "regex"
   export *
 }
@@ -226,7 +212,6 @@ module std_scoped_allocator [system] {
   export *
 }
 module std_semaphore [system] {
-  @requires_LIBCXX_ENABLE_THREADS@
   header "semaphore"
   export *
 }
@@ -235,7 +220,6 @@ module std_set [system] {
   export *
 }
 module std_shared_mutex [system] {
-  @requires_LIBCXX_ENABLE_THREADS@
   header "shared_mutex"
   export std_version
 }
@@ -250,7 +234,6 @@ module std_span [system] {
   export std_private_span_span_fwd
 }
 module std_sstream [system] {
-  @requires_LIBCXX_ENABLE_LOCALIZATION@
   header "sstream"
   export *
 }
@@ -263,12 +246,10 @@ module std_stdexcept [system] {
   export *
 }
 module std_stop_token {
-  @requires_LIBCXX_ENABLE_THREADS@
   header "stop_token"
   export *
 }
 module std_streambuf [system] {
-  @requires_LIBCXX_ENABLE_LOCALIZATION@
   header "streambuf"
   export *
 }
@@ -281,7 +262,6 @@ module std_string_view [system] {
   export *
 }
 module std_strstream [system] {
-  @requires_LIBCXX_ENABLE_LOCALIZATION@
   header "strstream"
   export *
 }
@@ -290,7 +270,6 @@ module std_system_error [system] {
   export *
 }
 module std_thread [system] {
-  @requires_LIBCXX_ENABLE_THREADS@
   header "thread"
   export *
 }
@@ -377,7 +356,6 @@ module std_climits [system] {
   export *
 }
 module std_clocale [system] {
-  @requires_LIBCXX_ENABLE_LOCALIZATION@
   header "clocale"
   export *
 }
@@ -435,12 +413,10 @@ module std_cuchar [system] {
   export *
 }
 module std_cwchar [system] {
-  @requires_LIBCXX_ENABLE_WIDE_CHARACTERS@
   header "cwchar"
   export *
 }
 module std_cwctype [system] {
-  @requires_LIBCXX_ENABLE_WIDE_CHARACTERS@
   header "cwctype"
   export *
 }
@@ -477,7 +453,6 @@ module std_limits_h [system] {
   export *
 }
 module std_locale_h [system] {
-  @requires_LIBCXX_ENABLE_LOCALIZATION@
   header "locale.h"
   export *
 }
@@ -493,8 +468,6 @@ module std_setjmp_h [system] {
 // FIXME: <stdalign.h> is missing.
 // <stdarg.h> provided by compiler.
 module std_stdatomic_h [system] {
-  @requires_LIBCXX_ENABLE_THREADS@
-  requires cplusplus23
   header "stdatomic.h"
   export *
 }
@@ -536,21 +509,17 @@ module std_uchar_h [system] {
 }
 // <time.h> provided by C library.
 module std_wchar_h [system] {
-  @requires_LIBCXX_ENABLE_WIDE_CHARACTERS@
   // <wchar.h>'s __need_* macros require textual inclusion.
   textual header "wchar.h"
   export *
 }
 module std_wctype_h [system] {
-  @requires_LIBCXX_ENABLE_WIDE_CHARACTERS@
   header "wctype.h"
   export *
 }
 
 // Experimental C++ standard library interfaces
 module std_experimental [system] {
-  requires cplusplus11
-
   module deque {
     header "experimental/deque"
     export *
@@ -657,7 +626,6 @@ module std_private_hash_table        [system] {
   export *
 }
 module std_private_locale            [system] {
-  @requires_LIBCXX_ENABLE_LOCALIZATION@
   header "__locale"
   export *
 }
@@ -1196,7 +1164,6 @@ module std_private_chrono_duration               [system] {
 }
 module std_private_chrono_file_clock             [system] { header "__chrono/file_clock.h" }
 module std_private_chrono_formatter              [system] {
-  @requires_LIBCXX_ENABLE_LOCALIZATION@
   header "__chrono/formatter.h"
 }
 module std_private_chrono_hh_mm_ss               [system] { header "__chrono/hh_mm_ss.h" }
@@ -1210,11 +1177,9 @@ module std_private_chrono_month                  [system] { header "__chrono/mon
 module std_private_chrono_month_weekday          [system] { header "__chrono/month_weekday.h" }
 module std_private_chrono_monthday               [system] { header "__chrono/monthday.h" }
 module std_private_chrono_ostream                [system] {
-  @requires_LIBCXX_ENABLE_LOCALIZATION@
   header "__chrono/ostream.h"
 }
 module std_private_chrono_parser_std_format_spec [system] {
-  @requires_LIBCXX_ENABLE_LOCALIZATION@
   header "__chrono/parser_std_format_spec.h"
 }
 module std_private_chrono_statically_widen       [system] { header "__chrono/statically_widen.h" }
@@ -1699,7 +1664,6 @@ module std_private_ranges_filter_view                [system] {
 module std_private_ranges_from_range                 [system] { header "__ranges/from_range.h" }
 module std_private_ranges_iota_view                  [system] { header "__ranges/iota_view.h" }
 module std_private_ranges_istream_view               [system] {
-  @requires_LIBCXX_ENABLE_LOCALIZATION@
   header "__ranges/istream_view.h"
 }
 module std_private_ranges_join_view                  [system] {

From 5e2c5225f8f4d22c2a220d984740a9cf37df80a9 Mon Sep 17 00:00:00 2001
From: Ian Anderson <iana@apple.com>
Date: Wed, 9 Aug 2023 13:54:38 -0700
Subject: [PATCH 23/92] [libc++][Modules] Generate the __std_clang_module
 header

Use header_information to generate the __std_clang_module header. Instead of using lit_header_restrictions like the manually written header did, make a new header_include_requirements to codify what can be included rather than what can be fully tested.

Reviewed By: Mordante, #libc

Differential Revision: https://reviews.llvm.org/D157364

# Conflicts:
#	libcxx/utils/libcxx/header_information.py
---
 libcxx/include/__std_clang_module             | 188 ++++++++----------
 libcxx/test/libcxx/modules_include.gen.py     |   9 +-
 libcxx/utils/CMakeLists.txt                   |   5 +
 .../utils/generate_std_clang_module_header.py |  82 ++++++++
 libcxx/utils/libcxx/header_information.py     |  46 +++++
 5 files changed, 215 insertions(+), 115 deletions(-)
 create mode 100644 libcxx/utils/generate_std_clang_module_header.py

diff --git a/libcxx/include/__std_clang_module b/libcxx/include/__std_clang_module
index 61a926eb6307e..46f50e87515b5 100644
--- a/libcxx/include/__std_clang_module
+++ b/libcxx/include/__std_clang_module
@@ -7,6 +7,10 @@
 //
 //===----------------------------------------------------------------------===//
 
+// WARNING, this entire header is generated by
+// utils/generate_std_clang_module_header.py
+// DO NOT MODIFY!
+
 // This header should not be directly included, it's exclusively to import all
 // of the libc++ public clang modules for the `std` clang module to export. In
 // other words, it's to facilitate `@import std;` in Objective-C++ and `import std`
@@ -17,7 +21,6 @@
 #  error "Do not include this header directly, include individual headers instead"
 #endif
 
-#include <__availability>
 #include <__config>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -27,52 +30,109 @@
 #include <algorithm>
 #include <any>
 #include <array>
-#include <atomic>
 #include <bit>
 #include <bitset>
+#include <cassert>
+#include <ccomplex>
+#include <cctype>
+#include <cerrno>
+#include <cfenv>
+#include <cfloat>
 #include <charconv>
 #include <chrono>
+#include <cinttypes>
+#include <ciso646>
+#include <climits>
+#include <cmath>
 #include <compare>
+#include <complex.h>
 #include <complex>
 #include <concepts>
 #include <condition_variable>
+#include <coroutine>
+#include <csetjmp>
+#include <csignal>
+#include <cstdarg>
+#include <cstdbool>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ctgmath>
+#include <ctime>
+#include <ctype.h>
+#include <cuchar>
 #include <deque>
+#include <errno.h>
 #include <exception>
 #include <execution>
 #include <expected>
+#include <experimental/deque>
+#include <experimental/forward_list>
+#include <experimental/iterator>
+#include <experimental/list>
+#include <experimental/map>
+#include <experimental/memory_resource>
+#include <experimental/propagate_const>
+#include <experimental/set>
+#include <experimental/simd>
+#include <experimental/string>
+#include <experimental/type_traits>
+#include <experimental/unordered_map>
+#include <experimental/unordered_set>
+#include <experimental/utility>
+#include <experimental/vector>
+#include <fenv.h>
+#include <filesystem>
+#include <float.h>
 #include <format>
 #include <forward_list>
 #include <functional>
 #include <initializer_list>
+#include <inttypes.h>
 #include <iosfwd>
 #include <iterator>
+#include <limits.h>
 #include <limits>
 #include <list>
 #include <map>
+#include <math.h>
 #include <mdspan>
 #include <memory>
 #include <memory_resource>
+#include <mutex>
 #include <new>
 #include <numbers>
 #include <numeric>
 #include <optional>
+#include <print>
 #include <queue>
 #include <random>
 #include <ranges>
 #include <ratio>
 #include <scoped_allocator>
 #include <set>
+#include <setjmp.h>
 #include <source_location>
 #include <span>
 #include <stack>
+#include <stdbool.h>
+#include <stddef.h>
 #include <stdexcept>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 #include <string>
 #include <string_view>
 #include <system_error>
+#include <tgmath.h>
 #include <tuple>
 #include <type_traits>
 #include <typeindex>
 #include <typeinfo>
+#include <uchar.h>
 #include <unordered_map>
 #include <unordered_set>
 #include <utility>
@@ -81,132 +141,42 @@
 #include <vector>
 #include <version>
 
-#include <cassert>
-#include <ccomplex>
-#include <cctype>
-#include <cerrno>
-#include <cfenv>
-#include <cfloat>
-#include <cinttypes>
-#include <ciso646>
-#include <climits>
-#include <cmath>
-#include <csetjmp>
-#include <csignal>
-#include <cstdarg>
-#include <cstdbool>
-#include <cstddef>
-#include <cstdint>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <ctgmath>
-#include <ctime>
-#include <cuchar>
-
-#include <complex.h>
-#include <ctype.h>
-#include <errno.h>
-#include <fenv.h>
-#include <float.h>
-#include <inttypes.h>
-#include <limits.h>
-#include <math.h>
-#include <setjmp.h>
-#include <stdbool.h>
-#include <stddef.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <tgmath.h>
-#include <uchar.h>
+#ifndef _LIBCPP_HAS_NO_ATOMIC_HEADER
+#  include <atomic>
+#  include <stdatomic.h>
+#endif
 
 #ifndef _LIBCPP_HAS_NO_LOCALIZATION
+#  include <clocale>
 #  include <codecvt>
-#  ifndef _LIBCPP_HAS_NO_FILESYSTEM
-#    include <fstream>
-#  endif
+#  include <experimental/regex>
+#  include <fstream>
 #  include <iomanip>
 #  include <ios>
 #  include <iostream>
 #  include <istream>
+#  include <locale.h>
 #  include <locale>
 #  include <ostream>
 #  include <regex>
 #  include <sstream>
 #  include <streambuf>
 #  include <strstream>
+#endif
 
-#  include <clocale>
-
-#  include <locale.h>
+#ifndef _LIBCPP_HAS_NO_THREADS
+#  include <barrier>
+#  include <future>
+#  include <latch>
+#  include <semaphore>
+#  include <shared_mutex>
+#  include <stop_token>
+#  include <thread>
 #endif
 
 #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
 #  include <cwchar>
 #  include <cwctype>
-
 #  include <wchar.h>
 #  include <wctype.h>
 #endif
-
-#ifdef _LIBCPP_AVAILABILITY_TO_CHARS_FLOATING_POINT
-#  include <print>
-#endif
-
-#ifndef _LIBCPP_CXX03_LANG
-#  ifndef _LIBCPP_HAS_NO_THREADS
-#    include <future>
-#    include <mutex>
-#    include <thread>
-#  endif
-
-#  include <experimental/deque>
-#  include <experimental/forward_list>
-#  include <experimental/iterator>
-#  include <experimental/list>
-#  include <experimental/map>
-#  include <experimental/memory_resource>
-#  include <experimental/propagate_const>
-#  ifndef _LIBCPP_HAS_NO_LOCALIZATION
-#    include <experimental/regex>
-#  endif
-#  include <experimental/set>
-#  include <experimental/simd>
-#  include <experimental/string>
-#  include <experimental/type_traits>
-#  include <experimental/unordered_map>
-#  include <experimental/unordered_set>
-#  include <experimental/utility>
-#  include <experimental/vector>
-#endif
-
-#if _LIBCPP_STD_VER >= 14
-#  ifndef _LIBCPP_HAS_NO_THREADS
-#    include <shared_mutex>
-#  endif
-#endif
-
-#if _LIBCPP_STD_VER >= 17
-#  ifndef _LIBCPP_HAS_NO_FILESYSTEM
-#    include <filesystem>
-#  endif
-#endif
-
-#if _LIBCPP_STD_VER >= 20
-#  include <coroutine>
-
-#  ifndef _LIBCPP_HAS_NO_THREADS
-#    include <barrier>
-#    include <latch>
-#    include <semaphore>
-#    include <stop_token>
-#  endif
-#endif
-
-#if _LIBCPP_STD_VER >= 23
-#  ifndef _LIBCPP_HAS_NO_THREADS
-#    include <stdatomic.h>
-#  endif
-#endif
diff --git a/libcxx/test/libcxx/modules_include.gen.py b/libcxx/test/libcxx/modules_include.gen.py
index 8ca50b0877eef..2e9fd73421ed2 100644
--- a/libcxx/test/libcxx/modules_include.gen.py
+++ b/libcxx/test/libcxx/modules_include.gen.py
@@ -16,8 +16,9 @@
 sys.path.append(sys.argv[1])
 from libcxx.header_information import lit_header_restrictions, public_headers
 
+BLOCKLIT = '' # block Lit from interpreting a RUN/XFAIL/etc inside the generation script
+
 for header in public_headers:
-  BLOCKLIT = '' # block Lit from interpreting a RUN/XFAIL/etc inside the generation script
   print(f"""\
 //--- {header}.compile.pass.cpp
 // RUN{BLOCKLIT}: %{{cxx}} %s %{{flags}} %{{compile_flags}} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only
@@ -45,7 +46,7 @@
 #include <{header}>
 """)
 
-print(f"""
+print(f"""\
 //--- __std_clang_module.compile.pass.mm
 // RUN{BLOCKLIT}: %{{cxx}} %s %{{flags}} %{{compile_flags}} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only
 
@@ -68,10 +69,6 @@
 // TODO: Investigate this failure
 // UNSUPPORTED{BLOCKLIT}: LIBCXX-FREEBSD-FIXME
 
-// Lit seems to compile this twice: once with the default flags and once with with
-// the flags specified in the RUN directive. Guard the first compile from failing.
-#if __has_feature(modules)
 @import std;
-#endif
 
 """)
diff --git a/libcxx/utils/CMakeLists.txt b/libcxx/utils/CMakeLists.txt
index 0338432f344a0..ce4e289290dce 100644
--- a/libcxx/utils/CMakeLists.txt
+++ b/libcxx/utils/CMakeLists.txt
@@ -2,6 +2,10 @@ add_custom_target(libcxx-generate-feature-test-macros
     COMMAND "${Python3_EXECUTABLE}" "${LIBCXX_SOURCE_DIR}/utils/generate_feature_test_macro_components.py"
     COMMENT "Generate the <version> header and tests for feature test macros.")
 
+add_custom_target(libcxx-generate-std-clang-module-header
+  COMMAND "${Python3_EXECUTABLE}" "${CMAKE_CURRENT_SOURCE_DIR}/generate_std_clang_module_header.py"
+  COMMENT "Generate the <__std_clang_module> header")
+
 add_custom_target(libcxx-generate-extended-grapheme-cluster-tables
     COMMAND
         "${Python3_EXECUTABLE}"
@@ -38,6 +42,7 @@ add_custom_target(libcxx-generate-iwyu-mapping
 
 add_custom_target(libcxx-generate-files
     DEPENDS libcxx-generate-feature-test-macros
+            libcxx-generate-std-clang-module-header
             libcxx-generate-extended-grapheme-cluster-tables
             libcxx-generate-extended-grapheme-cluster-tests
             libcxx-generate-escaped-output-table
diff --git a/libcxx/utils/generate_std_clang_module_header.py b/libcxx/utils/generate_std_clang_module_header.py
new file mode 100644
index 0000000000000..bcf0c220b7c59
--- /dev/null
+++ b/libcxx/utils/generate_std_clang_module_header.py
@@ -0,0 +1,82 @@
+# ===----------------------------------------------------------------------===##
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# ===----------------------------------------------------------------------===##
+
+import operator
+import os.path
+
+import libcxx.header_information
+
+public_headers = libcxx.header_information.public_headers
+header_include_requirements = libcxx.header_information.header_include_requirements
+always_available_headers = frozenset(public_headers).difference(
+    *header_include_requirements.values()
+)
+
+libcxx_include_directory = os.path.join(
+    os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "include"
+)
+with open(
+    os.path.join(libcxx_include_directory, "__std_clang_module"), "w"
+) as std_clang_module_header:
+    std_clang_module_header.write(
+        """\
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// WARNING, this entire header is generated by
+// utils/generate_std_clang_module_header.py
+// DO NOT MODIFY!
+
+// This header should not be directly included, it's exclusively to import all
+// of the libc++ public clang modules for the `std` clang module to export. In
+// other words, it's to facilitate `@import std;` in Objective-C++ and `import std`
+// in Swift to expose all of the libc++ interfaces. This is generally not
+// recommended, however there are some clients that need to import all of libc++
+// without knowing what "all" is.
+#if !__building_module(std)
+#  error "Do not include this header directly, include individual headers instead"
+#endif
+
+#include <__config>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+"""
+    )
+    # Include the angle brackets in sorting so that <a.h> sorts before <a>
+    # like check-format wants.
+    for include in sorted([f"<{header}>" for header in always_available_headers]):
+        std_clang_module_header.write(f"#include {include}\n")
+
+    for requirements, headers in sorted(
+        header_include_requirements.items(), key=operator.itemgetter(0)
+    ):
+        std_clang_module_header.write("\n")
+        if len(requirements) == 1:
+            std_clang_module_header.write("#ifndef ")
+            std_clang_module_header.write(requirements[0])
+        else:
+            std_clang_module_header.write("#if")
+            for index, requirement in enumerate(requirements):
+                if index > 0:
+                    std_clang_module_header.write(" &&")
+                std_clang_module_header.write(f" !defined({requirement})")
+        std_clang_module_header.write("\n")
+
+        for include in sorted([f"<{header}>" for header in headers]):
+            std_clang_module_header.write(f"#  include {include}\n")
+
+        std_clang_module_header.write("#endif\n")
diff --git a/libcxx/utils/libcxx/header_information.py b/libcxx/utils/libcxx/header_information.py
index f23a896180b31..3c04f01fcbe50 100644
--- a/libcxx/utils/libcxx/header_information.py
+++ b/libcxx/utils/libcxx/header_information.py
@@ -59,6 +59,52 @@
     "wctype.h": "// UNSUPPORTED: no-wide-characters",
 }
 
+header_include_requirements = {
+    ("_LIBCPP_HAS_NO_ATOMIC_HEADER",): (
+        # headers with #error directives
+        "atomic",
+        # transitive includers of the above headers
+        "stdatomic.h",
+    ),
+    ("_LIBCPP_HAS_NO_LOCALIZATION",): (
+        # headers with #error directives
+        "ios",
+        "locale.h",
+        # transitive includers of the above headers
+        "clocale",
+        "codecvt",
+        "experimental/regex",
+        "fstream",
+        "iomanip",
+        "iostream",
+        "istream",
+        "locale",
+        "ostream",
+        "regex",
+        "sstream",
+        "streambuf",
+        "strstream",
+    ),
+    ("_LIBCPP_HAS_NO_THREADS",): (
+        # headers with #error directives
+        "barrier",
+        "future",
+        "latch",
+        "semaphore",
+        "shared_mutex",
+        "stop_token",
+        "thread",
+    ),
+    ("_LIBCPP_HAS_NO_WIDE_CHARACTERS",): (
+        # headers with #error directives
+        "wchar.h",
+        "wctype.h",
+        # transitive includers of the above headers
+        "cwchar",
+        "cwctype",
+    ),
+}
+
 private_headers_still_public_in_modules = [
     "__assert",
     "__config",

From 0521244b24745ecbd098a8b0e02786f177dcb539 Mon Sep 17 00:00:00 2001
From: Ian Anderson <iana@apple.com>
Date: Wed, 16 Aug 2023 17:02:41 -0700
Subject: [PATCH 24/92] [libc++][Modules] Simplify the __std_clang_module
 header generation

Post review feedback on D157364. Don't section the __std_clang_module header by macro, put the headers in alphabetical order and repeat the macro guards. Restore header_information.header_restrictions.

Reviewed By: ldionne, #libc

Differential Revision: https://reviews.llvm.org/D158133

# Conflicts:
#	libcxx/utils/libcxx/header_information.py
---
 libcxx/include/__std_clang_module             | 118 ++++++++++++------
 .../utils/generate_std_clang_module_header.py |  34 ++---
 libcxx/utils/libcxx/header_information.py     |  86 ++++++-------
 3 files changed, 129 insertions(+), 109 deletions(-)

diff --git a/libcxx/include/__std_clang_module b/libcxx/include/__std_clang_module
index 46f50e87515b5..4d02336d30b06 100644
--- a/libcxx/include/__std_clang_module
+++ b/libcxx/include/__std_clang_module
@@ -30,6 +30,12 @@
 #include <algorithm>
 #include <any>
 #include <array>
+#if !defined(_LIBCPP_HAS_NO_ATOMIC_HEADER)
+#  include <atomic>
+#endif
+#if !defined(_LIBCPP_HAS_NO_THREADS)
+#  include <barrier>
+#endif
 #include <bit>
 #include <bitset>
 #include <cassert>
@@ -43,7 +49,13 @@
 #include <cinttypes>
 #include <ciso646>
 #include <climits>
+#if !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+#  include <clocale>
+#endif
 #include <cmath>
+#if !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+#  include <codecvt>
+#endif
 #include <compare>
 #include <complex.h>
 #include <complex>
@@ -63,6 +75,12 @@
 #include <ctime>
 #include <ctype.h>
 #include <cuchar>
+#if !defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS)
+#  include <cwchar>
+#endif
+#if !defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS)
+#  include <cwctype>
+#endif
 #include <deque>
 #include <errno.h>
 #include <exception>
@@ -75,6 +93,9 @@
 #include <experimental/map>
 #include <experimental/memory_resource>
 #include <experimental/propagate_const>
+#if !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+#  include <experimental/regex>
+#endif
 #include <experimental/set>
 #include <experimental/simd>
 #include <experimental/string>
@@ -88,14 +109,41 @@
 #include <float.h>
 #include <format>
 #include <forward_list>
+#if !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+#  include <fstream>
+#endif
 #include <functional>
+#if !defined(_LIBCPP_HAS_NO_THREADS)
+#  include <future>
+#endif
 #include <initializer_list>
 #include <inttypes.h>
+#if !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+#  include <iomanip>
+#endif
+#if !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+#  include <ios>
+#endif
 #include <iosfwd>
+#if !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+#  include <iostream>
+#endif
+#if !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+#  include <istream>
+#endif
 #include <iterator>
+#if !defined(_LIBCPP_HAS_NO_THREADS)
+#  include <latch>
+#endif
 #include <limits.h>
 #include <limits>
 #include <list>
+#if !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+#  include <locale.h>
+#endif
+#if !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+#  include <locale>
+#endif
 #include <map>
 #include <math.h>
 #include <mdspan>
@@ -106,28 +154,58 @@
 #include <numbers>
 #include <numeric>
 #include <optional>
+#if !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+#  include <ostream>
+#endif
 #include <print>
 #include <queue>
 #include <random>
 #include <ranges>
 #include <ratio>
+#if !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+#  include <regex>
+#endif
 #include <scoped_allocator>
+#if !defined(_LIBCPP_HAS_NO_THREADS)
+#  include <semaphore>
+#endif
 #include <set>
 #include <setjmp.h>
+#if !defined(_LIBCPP_HAS_NO_THREADS)
+#  include <shared_mutex>
+#endif
 #include <source_location>
 #include <span>
+#if !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+#  include <sstream>
+#endif
 #include <stack>
+#if !defined(_LIBCPP_HAS_NO_ATOMIC_HEADER)
+#  include <stdatomic.h>
+#endif
 #include <stdbool.h>
 #include <stddef.h>
 #include <stdexcept>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
+#if !defined(_LIBCPP_HAS_NO_THREADS)
+#  include <stop_token>
+#endif
+#if !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+#  include <streambuf>
+#endif
 #include <string.h>
 #include <string>
 #include <string_view>
+#if !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+#  include <strstream>
+#endif
 #include <system_error>
 #include <tgmath.h>
+#if !defined(_LIBCPP_HAS_NO_THREADS)
+#  include <thread>
+#endif
 #include <tuple>
 #include <type_traits>
 #include <typeindex>
@@ -140,43 +218,9 @@
 #include <variant>
 #include <vector>
 #include <version>
-
-#ifndef _LIBCPP_HAS_NO_ATOMIC_HEADER
-#  include <atomic>
-#  include <stdatomic.h>
-#endif
-
-#ifndef _LIBCPP_HAS_NO_LOCALIZATION
-#  include <clocale>
-#  include <codecvt>
-#  include <experimental/regex>
-#  include <fstream>
-#  include <iomanip>
-#  include <ios>
-#  include <iostream>
-#  include <istream>
-#  include <locale.h>
-#  include <locale>
-#  include <ostream>
-#  include <regex>
-#  include <sstream>
-#  include <streambuf>
-#  include <strstream>
-#endif
-
-#ifndef _LIBCPP_HAS_NO_THREADS
-#  include <barrier>
-#  include <future>
-#  include <latch>
-#  include <semaphore>
-#  include <shared_mutex>
-#  include <stop_token>
-#  include <thread>
-#endif
-
-#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
-#  include <cwchar>
-#  include <cwctype>
+#if !defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS)
 #  include <wchar.h>
+#endif
+#if !defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS)
 #  include <wctype.h>
 #endif
diff --git a/libcxx/utils/generate_std_clang_module_header.py b/libcxx/utils/generate_std_clang_module_header.py
index bcf0c220b7c59..afdc9f653c2a2 100644
--- a/libcxx/utils/generate_std_clang_module_header.py
+++ b/libcxx/utils/generate_std_clang_module_header.py
@@ -11,11 +11,7 @@
 
 import libcxx.header_information
 
-public_headers = libcxx.header_information.public_headers
-header_include_requirements = libcxx.header_information.header_include_requirements
-always_available_headers = frozenset(public_headers).difference(
-    *header_include_requirements.values()
-)
+header_restrictions = libcxx.header_information.header_restrictions
 
 libcxx_include_directory = os.path.join(
     os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "include"
@@ -58,25 +54,11 @@
     )
     # Include the angle brackets in sorting so that <a.h> sorts before <a>
     # like check-format wants.
-    for include in sorted([f"<{header}>" for header in always_available_headers]):
-        std_clang_module_header.write(f"#include {include}\n")
-
-    for requirements, headers in sorted(
-        header_include_requirements.items(), key=operator.itemgetter(0)
-    ):
-        std_clang_module_header.write("\n")
-        if len(requirements) == 1:
-            std_clang_module_header.write("#ifndef ")
-            std_clang_module_header.write(requirements[0])
-        else:
-            std_clang_module_header.write("#if")
-            for index, requirement in enumerate(requirements):
-                if index > 0:
-                    std_clang_module_header.write(" &&")
-                std_clang_module_header.write(f" !defined({requirement})")
-        std_clang_module_header.write("\n")
-
-        for include in sorted([f"<{header}>" for header in headers]):
+    for include, header in sorted([(f"<{header}>", header) for header in libcxx.header_information.public_headers]):
+        header_restriction = header_restrictions.get(header)
+        if header_restriction:
+            std_clang_module_header.write(f"#if {header_restriction}\n")
             std_clang_module_header.write(f"#  include {include}\n")
-
-        std_clang_module_header.write("#endif\n")
+            std_clang_module_header.write(f"#endif\n")
+        else:
+            std_clang_module_header.write(f"#include {include}\n")
diff --git a/libcxx/utils/libcxx/header_information.py b/libcxx/utils/libcxx/header_information.py
index 3c04f01fcbe50..169638d5efc11 100644
--- a/libcxx/utils/libcxx/header_information.py
+++ b/libcxx/utils/libcxx/header_information.py
@@ -8,6 +8,46 @@
 
 import os, pathlib
 
+header_restrictions = {
+    # headers with #error directives
+    "atomic": "!defined(_LIBCPP_HAS_NO_ATOMIC_HEADER)",
+    "stdatomic.h": "!defined(_LIBCPP_HAS_NO_ATOMIC_HEADER)",
+
+    # headers with #error directives
+    "ios": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)",
+    "locale.h": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)",
+    # transitive includers of the above headers
+    "clocale": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)",
+    "codecvt": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)",
+    "experimental/regex": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)",
+    "fstream": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)",
+    "iomanip": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)",
+    "iostream": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)",
+    "istream": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)",
+    "locale": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)",
+    "ostream": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)",
+    "regex": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)",
+    "sstream": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)",
+    "streambuf": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)",
+    "strstream": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)",
+
+    # headers with #error directives
+    "barrier": "!defined(_LIBCPP_HAS_NO_THREADS)",
+    "future": "!defined(_LIBCPP_HAS_NO_THREADS)",
+    "latch": "!defined(_LIBCPP_HAS_NO_THREADS)",
+    "semaphore": "!defined(_LIBCPP_HAS_NO_THREADS)",
+    "shared_mutex": "!defined(_LIBCPP_HAS_NO_THREADS)",
+    "stop_token": "!defined(_LIBCPP_HAS_NO_THREADS)",
+    "thread": "!defined(_LIBCPP_HAS_NO_THREADS)",
+
+    # headers with #error directives
+    "wchar.h": "!defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS)",
+    "wctype.h": "!defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS)",
+    # transitive includers of the above headers
+    "cwchar": "!defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS)",
+    "cwctype": "!defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS)",
+}
+
 lit_header_restrictions = {
     "barrier": "// UNSUPPORTED: no-threads, c++03, c++11, c++14, c++17",
     "clocale": "// UNSUPPORTED: no-localization",
@@ -59,52 +99,6 @@
     "wctype.h": "// UNSUPPORTED: no-wide-characters",
 }
 
-header_include_requirements = {
-    ("_LIBCPP_HAS_NO_ATOMIC_HEADER",): (
-        # headers with #error directives
-        "atomic",
-        # transitive includers of the above headers
-        "stdatomic.h",
-    ),
-    ("_LIBCPP_HAS_NO_LOCALIZATION",): (
-        # headers with #error directives
-        "ios",
-        "locale.h",
-        # transitive includers of the above headers
-        "clocale",
-        "codecvt",
-        "experimental/regex",
-        "fstream",
-        "iomanip",
-        "iostream",
-        "istream",
-        "locale",
-        "ostream",
-        "regex",
-        "sstream",
-        "streambuf",
-        "strstream",
-    ),
-    ("_LIBCPP_HAS_NO_THREADS",): (
-        # headers with #error directives
-        "barrier",
-        "future",
-        "latch",
-        "semaphore",
-        "shared_mutex",
-        "stop_token",
-        "thread",
-    ),
-    ("_LIBCPP_HAS_NO_WIDE_CHARACTERS",): (
-        # headers with #error directives
-        "wchar.h",
-        "wctype.h",
-        # transitive includers of the above headers
-        "cwchar",
-        "cwctype",
-    ),
-}
-
 private_headers_still_public_in_modules = [
     "__assert",
     "__config",

From f63cdbcc0cdccf14ac699207be6226b40b1751f8 Mon Sep 17 00:00:00 2001
From: Konstantin Varlamov <varconsteq@gmail.com>
Date: Thu, 27 Jul 2023 15:24:15 -0700
Subject: [PATCH 25/92] [libc++][ranges] Fix `ranges::to` test.

- Make a test for an internal concept libc++-only;
- Make sure that `size` and `capacity` in a test container return the
  same type on all platforms.

(cherry picked from commit 8b9a98661b780a5b50d1d6a1f822d25e0c454382)
---
 .../std/ranges/range.utility/range.utility.conv/to.pass.cpp   | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/libcxx/test/std/ranges/range.utility/range.utility.conv/to.pass.cpp b/libcxx/test/std/ranges/range.utility/range.utility.conv/to.pass.cpp
index 75f55bc420d0e..03270f25fd92b 100644
--- a/libcxx/test/std/ranges/range.utility/range.utility.conv/to.pass.cpp
+++ b/libcxx/test/std/ranges/range.utility/range.utility.conv/to.pass.cpp
@@ -19,6 +19,7 @@
 #include <vector>
 #include "container.h"
 #include "test_iterators.h"
+#include "test_macros.h"
 #include "test_range.h"
 
 template <class Container, class Range, class... Args>
@@ -119,6 +120,7 @@ struct Fallback {
   constexpr void push_back(value_type) {}
   constexpr value_type* begin() { return &x; }
   constexpr value_type* end() { return &x; }
+  std::size_t size() const { return 0; }
 };
 
 struct CtrDirectOrFallback : Fallback {
@@ -180,7 +182,7 @@ struct Reservable : Fallback {
     reserve_called = true;
   }
 };
-static_assert(std::ranges::__reservable_container<Reservable<>>);
+LIBCPP_STATIC_ASSERT(std::ranges::__reservable_container<Reservable<>>);
 
 constexpr void test_constraints() {
   { // Case 1 -- construct directly from the range.

From 4114813bf2fe23f4be34633037230e7a2ec22701 Mon Sep 17 00:00:00 2001
From: Konstantin Varlamov <varconsteq@gmail.com>
Date: Thu, 27 Jul 2023 15:26:50 -0700
Subject: [PATCH 26/92] [libc++] Increase the constexpr steps limit on some
 `bitset` tests.

Prevent these tests from failing on some platforms (the number of
constexpr steps increased by https://reviews.llvm.org/D154860).

(cherry picked from commit 91876eab93a9f0ef29a339ed99bdb1c8ed1e85c6)
---
 .../utilities/template.bitset/bitset.members/op_or_eq.pass.cpp  | 2 ++
 .../template.bitset/bitset.members/right_shift_eq.pass.cpp      | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/libcxx/test/std/utilities/template.bitset/bitset.members/op_or_eq.pass.cpp b/libcxx/test/std/utilities/template.bitset/bitset.members/op_or_eq.pass.cpp
index 0d0d62432e01d..8f3c0959c622f 100644
--- a/libcxx/test/std/utilities/template.bitset/bitset.members/op_or_eq.pass.cpp
+++ b/libcxx/test/std/utilities/template.bitset/bitset.members/op_or_eq.pass.cpp
@@ -6,6 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=15000000
+
 // bitset<N>& operator|=(const bitset<N>& rhs); // constexpr since C++23
 
 #include <bitset>
diff --git a/libcxx/test/std/utilities/template.bitset/bitset.members/right_shift_eq.pass.cpp b/libcxx/test/std/utilities/template.bitset/bitset.members/right_shift_eq.pass.cpp
index bf182523880d1..ef4b7fc60329d 100644
--- a/libcxx/test/std/utilities/template.bitset/bitset.members/right_shift_eq.pass.cpp
+++ b/libcxx/test/std/utilities/template.bitset/bitset.members/right_shift_eq.pass.cpp
@@ -6,6 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=15000000
+
 // bitset<N>& operator<<=(size_t pos); // constexpr since C++23
 
 #include <bitset>

From 2641da8e1057d943b79bc4ea906ee6804ea9dace Mon Sep 17 00:00:00 2001
From: Ian Anderson <iana@apple.com>
Date: Wed, 23 Aug 2023 13:52:18 -0700
Subject: [PATCH 27/92] [libc++][Modules] locale fails to compile with clang
 modules when _LIBCPP_LOCALE__L_EXTENSIONS is undefined

When `__locale_dir/locale_base_api/locale_guard.h` is compiled independently, as it is when it's in its own clang module, it fails to compile due to `locale_t` being undefined. It needs to include `__locale` to get that, instead of just `clocale`.

Reviewed By: ldionne, #libc

Differential Revision: https://reviews.llvm.org/D158669

(cherry picked from commit 6021c78fe55e3c4d3e073710bfe279c6a28566d4)
---
 libcxx/include/__locale_dir/locale_base_api/locale_guard.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libcxx/include/__locale_dir/locale_base_api/locale_guard.h b/libcxx/include/__locale_dir/locale_base_api/locale_guard.h
index 0e2e91af7d190..5946ed698e0fd 100644
--- a/libcxx/include/__locale_dir/locale_base_api/locale_guard.h
+++ b/libcxx/include/__locale_dir/locale_base_api/locale_guard.h
@@ -10,6 +10,7 @@
 #define _LIBCPP___LOCALE_LOCALE_BASE_API_LOCALE_GUARD_H
 
 #include <__config>
+#include <__locale> // for locale_t
 #include <clocale>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)

From f05226d7e38c36efe029a0eb4201b0843f81b5e8 Mon Sep 17 00:00:00 2001
From: Chuanqi Xu <yedeng.yd@linux.alibaba.com>
Date: Mon, 14 Aug 2023 13:24:53 +0800
Subject: [PATCH 28/92] [C++20] [Coroutines] Mark await_suspend as noinline if
 the awaiter is not empty

Close https://github.com/llvm/llvm-project/issues/56301
Close https://github.com/llvm/llvm-project/issues/64151

See the summary and the discussion of https://reviews.llvm.org/D157070
to get the full context.

As @rjmccall pointed out, the key point of the root cause is that
we currently don't implement the semantics of '@llvm.coro.save' correctly
("after the await-ready returns false, the coroutine is considered to be
suspended").
These semantics imply that we (the compiler) shouldn't write the
spills into the coroutine frame in the await_suspend. But that is currently
possible due to some combinations of optimizations, so the semantics are
broken. Inlining is the root optimization that enables those optimizations.
So in this patch, we tried to add the `noinline` attribute to the
await_suspend call.

Also as an optimization, we don't add the `noinline` attribute to the
await_suspend call if the awaiter is an empty class. This should be
correct since the programmers can't access the local variables in
await_suspend if the awaiter is empty. I think this is necessary for the
performance since it is pretty common.

Another potential optimization is:

    call @llvm.coro.await_suspend(ptr %awaiter, ptr %handle,
                                  ptr @awaitSuspendFn)

Then it is much easier to perform the safety analysis in the middle
end.
If it is safe to inline the call to awaitSuspend, we can replace it
in the CoroEarly pass. Otherwise we could replace it in the CoroSplit
pass.

Reviewed By: rjmccall

Differential Revision: https://reviews.llvm.org/D157833
---
 clang/docs/ReleaseNotes.rst                   |   4 +
 clang/lib/CodeGen/CGCall.cpp                  |  24 ++
 clang/lib/CodeGen/CGCoroutine.cpp             |  33 +++
 clang/lib/CodeGen/CodeGenFunction.h           |   5 +
 .../coro-awaiter-noinline-suspend.cpp         | 207 ++++++++++++++++++
 clang/test/CodeGenCoroutines/pr56301.cpp      |  85 +++++++
 6 files changed, 358 insertions(+)
 create mode 100644 clang/test/CodeGenCoroutines/coro-awaiter-noinline-suspend.cpp
 create mode 100644 clang/test/CodeGenCoroutines/pr56301.cpp

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 5add59680fd76..bd66a2224eccb 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -702,6 +702,10 @@ Bug Fixes in This Version
 - Fix a hang on valid C code passing a function type as an argument to
   ``typeof`` to form a function declaration.
   (`#64713 <https://github.com/llvm/llvm-project/issues/64713>_`)
+- Fixed an issue where accesses to the local variables of a coroutine during
+  ``await_suspend`` could be misoptimized, including accesses to the awaiter
+  object itself.
+  (`#56301 <https://github.com/llvm/llvm-project/issues/56301>`_)
 
 Bug Fixes to Compiler Builtins
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 6b8af9bf18c1f..0d1e9ad439b7d 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -5487,6 +5487,30 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
         Attrs.addFnAttribute(getLLVMContext(), llvm::Attribute::AlwaysInline);
   }
 
+  // The await_suspend call performed by co_await is essentially asynchronous
+  // to the execution of the coroutine. Inlining it normally into an unsplit
+  // coroutine can cause miscompilation because the coroutine CFG misrepresents
+  // the true control flow of the program: things that happen in the
+  // await_suspend are not guaranteed to happen prior to the resumption of the
+  // coroutine, and things that happen after the resumption of the coroutine
+  // (including its exit and the potential deallocation of the coroutine frame)
+  // are not guaranteed to happen only after the end of await_suspend.
+  //
+  // The short-term solution to this problem is to mark the call as uninlinable.
+  // But we don't want to do this if the call is known to be trivial, which is
+  // very common.
+  //
+  // The long-term solution may introduce patterns like:
+  //
+  //  call @llvm.coro.await_suspend(ptr %awaiter, ptr %handle,
+  //                                ptr @awaitSuspendFn)
+  //
+  // Then it is much easier to perform the safety analysis in the middle end.
+  // If it is safe to inline the call to awaitSuspend, we can replace it in the
+  // CoroEarly pass. Otherwise we could replace it in the CoroSplit pass.
+  if (inSuspendBlock() && mayCoroHandleEscape())
+    Attrs = Attrs.addFnAttribute(getLLVMContext(), llvm::Attribute::NoInline);
+
   // Disable inlining inside SEH __try blocks.
   if (isSEHTryScope()) {
     Attrs = Attrs.addFnAttribute(getLLVMContext(), llvm::Attribute::NoInline);
diff --git a/clang/lib/CodeGen/CGCoroutine.cpp b/clang/lib/CodeGen/CGCoroutine.cpp
index 8437cda79beb2..810ae7d51ec10 100644
--- a/clang/lib/CodeGen/CGCoroutine.cpp
+++ b/clang/lib/CodeGen/CGCoroutine.cpp
@@ -139,6 +139,36 @@ static bool memberCallExpressionCanThrow(const Expr *E) {
   return true;
 }
 
+/// Return true when the coroutine handle may escape from the await-suspend
+/// (`awaiter.await_suspend(std::coroutine_handle)` expression).
+/// Return false only when the coroutine wouldn't escape in the await-suspend
+/// for sure.
+///
+/// While it is always safe to return true, return falses can bring better
+/// performances.
+///
+/// See https://github.com/llvm/llvm-project/issues/56301 and
+/// https://reviews.llvm.org/D157070 for the example and the full discussion.
+///
+/// FIXME: It will be much better to perform such analysis in the middle end.
+/// See the comments in `CodeGenFunction::EmitCall` for example.
+static bool MayCoroHandleEscape(CoroutineSuspendExpr const &S) {
+  CXXRecordDecl *Awaiter =
+      S.getCommonExpr()->getType().getNonReferenceType()->getAsCXXRecordDecl();
+
+  // Return true conservatively if the awaiter type is not a record type.
+  if (!Awaiter)
+    return true;
+
+  // In case the awaiter type is empty, the suspend wouldn't leak the coroutine
+  // handle.
+  //
+  // TODO: We can improve this by looking into the implementation of
+  // await-suspend and see if the coroutine handle is passed to foreign
+  // functions.
+  return !Awaiter->field_empty();
+}
+
 // Emit suspend expression which roughly looks like:
 //
 //   auto && x = CommonExpr();
@@ -199,8 +229,11 @@ static LValueOrRValue emitSuspendExpression(CodeGenFunction &CGF, CGCoroData &Co
   auto *SaveCall = Builder.CreateCall(CoroSave, {NullPtr});
 
   CGF.CurCoro.InSuspendBlock = true;
+  CGF.CurCoro.MayCoroHandleEscape = MayCoroHandleEscape(S);
   auto *SuspendRet = CGF.EmitScalarExpr(S.getSuspendExpr());
   CGF.CurCoro.InSuspendBlock = false;
+  CGF.CurCoro.MayCoroHandleEscape = false;
+
   if (SuspendRet != nullptr && SuspendRet->getType()->isIntegerTy(1)) {
     // Veto suspension if requested by bool returning await_suspend.
     BasicBlock *RealSuspendBlock =
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 8722fd4550e4a..28ec2b9700721 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -334,6 +334,7 @@ class CodeGenFunction : public CodeGenTypeCache {
   struct CGCoroInfo {
     std::unique_ptr<CGCoroData> Data;
     bool InSuspendBlock = false;
+    bool MayCoroHandleEscape = false;
     CGCoroInfo();
     ~CGCoroInfo();
   };
@@ -347,6 +348,10 @@ class CodeGenFunction : public CodeGenTypeCache {
     return isCoroutine() && CurCoro.InSuspendBlock;
   }
 
+  bool mayCoroHandleEscape() const {
+    return isCoroutine() && CurCoro.MayCoroHandleEscape;
+  }
+
   /// CurGD - The GlobalDecl for the current function being compiled.
   GlobalDecl CurGD;
 
diff --git a/clang/test/CodeGenCoroutines/coro-awaiter-noinline-suspend.cpp b/clang/test/CodeGenCoroutines/coro-awaiter-noinline-suspend.cpp
new file mode 100644
index 0000000000000..f935e256d9db9
--- /dev/null
+++ b/clang/test/CodeGenCoroutines/coro-awaiter-noinline-suspend.cpp
@@ -0,0 +1,207 @@
+// Tests that we can mark await-suspend as noinline correctly.
+//
+// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s \
+// RUN:     -disable-llvm-passes | FileCheck %s
+
+#include "Inputs/coroutine.h"
+
+struct Task {
+  struct promise_type {
+    struct FinalAwaiter {
+      bool await_ready() const noexcept { return false; }
+      template <typename PromiseType>
+      std::coroutine_handle<> await_suspend(std::coroutine_handle<PromiseType> h) noexcept {
+        return h.promise().continuation;
+      }
+      void await_resume() noexcept {}
+    };
+
+    Task get_return_object() noexcept {
+      return std::coroutine_handle<promise_type>::from_promise(*this);
+    }
+
+    std::suspend_always initial_suspend() noexcept { return {}; }
+    FinalAwaiter final_suspend() noexcept { return {}; }
+    void unhandled_exception() noexcept {}
+    void return_void() noexcept {}
+
+    std::coroutine_handle<> continuation;
+  };
+
+  Task(std::coroutine_handle<promise_type> handle);
+  ~Task();
+
+private:
+  std::coroutine_handle<promise_type> handle;
+};
+
+struct StatefulAwaiter {
+    int value;
+    bool await_ready() const noexcept { return false; }
+    template <typename PromiseType>
+    void await_suspend(std::coroutine_handle<PromiseType> h) noexcept {}
+    void await_resume() noexcept {}
+};
+
+typedef std::suspend_always NoStateAwaiter;
+using AnotherStatefulAwaiter = StatefulAwaiter;
+
+template <class T>
+struct TemplatedAwaiter {
+    T value;
+    bool await_ready() const noexcept { return false; }
+    template <typename PromiseType>
+    void await_suspend(std::coroutine_handle<PromiseType> h) noexcept {}
+    void await_resume() noexcept {}
+};
+
+
+class Awaitable {};
+StatefulAwaiter operator co_await(Awaitable) {
+  return StatefulAwaiter{};
+}
+
+StatefulAwaiter GlobalAwaiter;
+class Awaitable2 {};
+StatefulAwaiter& operator co_await(Awaitable2) {
+  return GlobalAwaiter;
+}
+
+Task testing() {
+    co_await std::suspend_always{};
+    co_await StatefulAwaiter{};
+    co_await AnotherStatefulAwaiter{};
+    
+    // Test lvalue case.
+    StatefulAwaiter awaiter;
+    co_await awaiter;
+
+    // The explicit call to await_suspend is not considered suspended.
+    awaiter.await_suspend(std::coroutine_handle<void>::from_address(nullptr));
+
+    co_await TemplatedAwaiter<int>{};
+    TemplatedAwaiter<int> TemplatedAwaiterInstace;
+    co_await TemplatedAwaiterInstace;
+
+    co_await Awaitable{};
+    co_await Awaitable2{};
+}
+
+// CHECK-LABEL: @_Z7testingv
+
+// Check `co_await __promise__.initial_suspend();` Since it returns std::suspend_always,
+// which is an empty class, we shouldn't generate optimization blocker for it.
+// CHECK: call token @llvm.coro.save
+// CHECK: call void @_ZNSt14suspend_always13await_suspendESt16coroutine_handleIvE{{.*}}#[[NORMAL_ATTR:[0-9]+]]
+
+// Check the `co_await std::suspend_always{};` expression. We shouldn't emit the optimization
+// blocker for it since it is an empty class.
+// CHECK: call token @llvm.coro.save
+// CHECK: call void @_ZNSt14suspend_always13await_suspendESt16coroutine_handleIvE{{.*}}#[[NORMAL_ATTR]]
+
+// Check `co_await StatefulAwaiter{};`. We need to emit the optimization blocker since
+// the awaiter is not empty.
+// CHECK: call token @llvm.coro.save
+// CHECK: call void @_ZN15StatefulAwaiter13await_suspendIN4Task12promise_typeEEEvSt16coroutine_handleIT_E{{.*}}#[[NOINLINE_ATTR:[0-9]+]]
+
+// Check `co_await AnotherStatefulAwaiter{};` to make sure that we can handle TypedefTypes.
+// CHECK: call token @llvm.coro.save
+// CHECK: call void @_ZN15StatefulAwaiter13await_suspendIN4Task12promise_typeEEEvSt16coroutine_handleIT_E{{.*}}#[[NOINLINE_ATTR]]
+
+// Check `co_await awaiter;` to make sure we can handle lvalue cases.
+// CHECK: call token @llvm.coro.save
+// CHECK: call void @_ZN15StatefulAwaiter13await_suspendIN4Task12promise_typeEEEvSt16coroutine_handleIT_E{{.*}}#[[NOINLINE_ATTR]]
+
+// Check `awaiter.await_suspend(...)` to make sure the explicit call to await_suspend won't be marked as noinline
+// CHECK: call void @_ZN15StatefulAwaiter13await_suspendIvEEvSt16coroutine_handleIT_E{{.*}}#[[NORMAL_ATTR]]
+
+// Check `co_await TemplatedAwaiter<int>{};` to make sure we can handle specialized template
+// type.
+// CHECK: call token @llvm.coro.save
+// CHECK: call void @_ZN16TemplatedAwaiterIiE13await_suspendIN4Task12promise_typeEEEvSt16coroutine_handleIT_E{{.*}}#[[NOINLINE_ATTR]]
+
+// Check `co_await TemplatedAwaiterInstace;` to make sure we can handle the lvalue from
+// specialized template type.
+// CHECK: call token @llvm.coro.save
+// CHECK: call void @_ZN16TemplatedAwaiterIiE13await_suspendIN4Task12promise_typeEEEvSt16coroutine_handleIT_E{{.*}}#[[NOINLINE_ATTR]]
+
+// Check `co_await Awaitable{};` to make sure we can handle awaiter returned by
+// `operator co_await`;
+// CHECK: call token @llvm.coro.save
+// CHECK: call void @_ZN15StatefulAwaiter13await_suspendIN4Task12promise_typeEEEvSt16coroutine_handleIT_E{{.*}}#[[NOINLINE_ATTR]]
+
+// Check `co_await Awaitable2{};` to make sure we can handle awaiter returned by
+// `operator co_await` which returns a reference;
+// CHECK: call token @llvm.coro.save
+// CHECK: call void @_ZN15StatefulAwaiter13await_suspendIN4Task12promise_typeEEEvSt16coroutine_handleIT_E{{.*}}#[[NOINLINE_ATTR]]
+
+// Check `co_await __promise__.final_suspend();`. We don't emit a blocker here since it is
+// empty.
+// CHECK: call token @llvm.coro.save
+// CHECK: call ptr @_ZN4Task12promise_type12FinalAwaiter13await_suspendIS0_EESt16coroutine_handleIvES3_IT_E{{.*}}#[[NORMAL_ATTR]]
+
+struct AwaitTransformTask {
+  struct promise_type {
+    struct FinalAwaiter {
+      bool await_ready() const noexcept { return false; }
+      template <typename PromiseType>
+      std::coroutine_handle<> await_suspend(std::coroutine_handle<PromiseType> h) noexcept {
+        return h.promise().continuation;
+      }
+      void await_resume() noexcept {}
+    };
+
+    AwaitTransformTask get_return_object() noexcept {
+      return std::coroutine_handle<promise_type>::from_promise(*this);
+    }
+
+    std::suspend_always initial_suspend() noexcept { return {}; }
+    FinalAwaiter final_suspend() noexcept { return {}; }
+    void unhandled_exception() noexcept {}
+    void return_void() noexcept {}
+
+    template <typename Awaitable>
+    auto await_transform(Awaitable &&awaitable) {
+      return awaitable;
+    }
+
+    std::coroutine_handle<> continuation;
+  };
+
+  AwaitTransformTask(std::coroutine_handle<promise_type> handle);
+  ~AwaitTransformTask();
+
+private:
+  std::coroutine_handle<promise_type> handle;
+};
+
+struct awaitableWithGetAwaiter {
+  bool await_ready() const noexcept { return false; }
+  template <typename PromiseType>
+  void await_suspend(std::coroutine_handle<PromiseType> h) noexcept {}
+  void await_resume() noexcept {}
+};
+
+AwaitTransformTask testingWithAwaitTransform() {
+  co_await awaitableWithGetAwaiter{};
+}
+
+// CHECK-LABEL: @_Z25testingWithAwaitTransformv
+
+// Init suspend
+// CHECK: call token @llvm.coro.save
+// CHECK-NOT: call void @llvm.coro.opt.blocker(
+// CHECK: call void @_ZNSt14suspend_always13await_suspendESt16coroutine_handleIvE{{.*}}#[[NORMAL_ATTR]]
+
+// Check `co_await awaitableWithGetAwaiter{};`.
+// CHECK: call token @llvm.coro.save
+// CHECK-NOT: call void @llvm.coro.opt.blocker(
+// CHECK: call void @_ZN23awaitableWithGetAwaiter13await_suspendIN18AwaitTransformTask12promise_typeEEEvSt16coroutine_handleIT_E{{.*}}#[[NORMAL_ATTR]]
+
+// Final suspend
+// CHECK: call token @llvm.coro.save
+// CHECK-NOT: call void @llvm.coro.opt.blocker(
+// CHECK: call ptr @_ZN18AwaitTransformTask12promise_type12FinalAwaiter13await_suspendIS0_EESt16coroutine_handleIvES3_IT_E{{.*}}#[[NORMAL_ATTR]]
+
+// CHECK-NOT: attributes #[[NORMAL_ATTR]] = noinline
+// CHECK: attributes #[[NOINLINE_ATTR]] = {{.*}}noinline
diff --git a/clang/test/CodeGenCoroutines/pr56301.cpp b/clang/test/CodeGenCoroutines/pr56301.cpp
new file mode 100644
index 0000000000000..cd851c0b815db
--- /dev/null
+++ b/clang/test/CodeGenCoroutines/pr56301.cpp
@@ -0,0 +1,85 @@
+// An end-to-end test to make sure things get processed correctly.
+// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s -O3 | \
+// RUN:     FileCheck %s
+
+#include "Inputs/coroutine.h"
+
+struct SomeAwaitable {
+  // Resume the supplied handle once the awaitable becomes ready,
+  // returning a handle that should be resumed now for the sake of symmetric transfer.
+  // If the awaitable is already ready, return an empty handle without doing anything.
+  //
+  // Defined in another translation unit. Note that this may contain
+  // code that synchronizes with another thread.
+  std::coroutine_handle<> Register(std::coroutine_handle<>);
+};
+
+// Defined in another translation unit.
+void DidntSuspend();
+
+struct Awaiter {
+  SomeAwaitable&& awaitable;
+  bool suspended;
+
+  bool await_ready() { return false; }
+
+  std::coroutine_handle<> await_suspend(const std::coroutine_handle<> h) {
+    // Assume we will suspend unless proven otherwise below. We must do
+    // this *before* calling Register, since we may be destroyed by another
+    // thread asynchronously as soon as we have registered.
+    suspended = true;
+
+    // Attempt to hand off responsibility for resuming/destroying the coroutine.
+    const auto to_resume = awaitable.Register(h);
+
+    if (!to_resume) {
+      // The awaitable is already ready. In this case we know that Register didn't
+      // hand off responsibility for the coroutine. So record the fact that we didn't
+      // actually suspend, and tell the compiler to resume us inline.
+      suspended = false;
+      return h;
+    }
+
+    // Resume whatever Register wants us to resume.
+    return to_resume;
+  }
+
+  void await_resume() {
+    // If we didn't suspend, make note of that fact.
+    if (!suspended) {
+      DidntSuspend();
+    }
+  }
+};
+
+struct MyTask{
+  struct promise_type {
+    MyTask get_return_object() { return {}; }
+    std::suspend_never initial_suspend() { return {}; }
+    std::suspend_always final_suspend() noexcept { return {}; }
+    void unhandled_exception();
+
+    Awaiter await_transform(SomeAwaitable&& awaitable) {
+      return Awaiter{static_cast<SomeAwaitable&&>(awaitable)};
+    }
+  };
+};
+
+MyTask FooBar() {
+  co_await SomeAwaitable();
+}
+
+// CHECK-LABEL: @_Z6FooBarv
+// CHECK: %[[to_resume:.*]] = {{.*}}call ptr @_ZN13SomeAwaitable8RegisterESt16coroutine_handleIvE
+// CHECK-NEXT: %[[to_bool:.*]] = icmp eq ptr %[[to_resume]], null
+// CHECK-NEXT: br i1 %[[to_bool]], label %[[then:.*]], label %[[else:.*]]
+
+// CHECK: [[then]]:
+// We only access the coroutine frame conditionally as the sources did.
+// CHECK:   store i8 0,
+// CHECK-NEXT: br label %[[else]]
+
+// CHECK: [[else]]:
+// No more access to the coroutine frame until suspended.
+// CHECK-NOT: store
+// CHECK: }

From 6998ecd330f2b028bf4678edd4f53b5489c5e6df Mon Sep 17 00:00:00 2001
From: Chuanqi Xu <yedeng.yd@linux.alibaba.com>
Date: Thu, 24 Aug 2023 10:41:44 +0800
Subject: [PATCH 29/92] [NFC] [C++20] [Coroutines] Mention the side effect of a
 fix may bring regressions

The fix we sent for https://github.com/llvm/llvm-project/issues/56301
may bring performance regressions. But we didn't mention it in the
ReleaseNotes so that users may get confused. e.g.,
https://github.com/llvm/llvm-project/issues/64933. So this patch
mentions the possible side effect and the potential solutions in
https://github.com/llvm/llvm-project/issues/64945 to avoid
misunderstandings.
---
 clang/docs/ReleaseNotes.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index bd66a2224eccb..b15ff0d6d20a3 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -706,6 +706,9 @@ Bug Fixes in This Version
   ``await_suspend`` could be misoptimized, including accesses to the awaiter
   object itself.
   (`#56301 <https://github.com/llvm/llvm-project/issues/56301>`_)
+  The current solution may bring performance regressions if the awaiters have
+  non-static data members. See
+  `#64945 <https://github.com/llvm/llvm-project/issues/64945>`_ for details.
 
 Bug Fixes to Compiler Builtins
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

From 51844c6ca9330066dd24a9072cde9fd919b28e32 Mon Sep 17 00:00:00 2001
From: Denis Revunov <revunov.denis@huawei-partners.com>
Date: Thu, 29 Jun 2023 20:38:50 +0300
Subject: [PATCH 30/92] [BOLT][Instrumentation] Fix indirect call profile in
 PIE

Because indirect call tables use static addresses for call sites, but pc
values recorded by runtime may be subject to ASLR in PIE, we couldn't
find indirect call descriptions by their runtime address in PIE. It
resulted in [unknown] entries in profile for all indirect calls. We need
to subtract base address of .text from runtime addresses to get the
corresponding static addresses. Here we create a getter for base address
of .text and subtract its return value from recorded PC values. It
converts them to static addresses, which then may be used to find the
corresponding indirect call descriptions.

Reviewed By: rafauler

Differential Revision: https://reviews.llvm.org/D154121

(cherry picked from commit a86dd9ae60662cfe9f9fb709a33c71d6fec66dfb)
---
 bolt/lib/Rewrite/RewriteInstance.cpp |  5 +++--
 bolt/runtime/common.h                | 14 ++++++++++++++
 bolt/runtime/instr.cpp               |  9 ++++++++-
 3 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index 424b10c62b256..fe8c134b8554e 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -1844,8 +1844,9 @@ void RewriteInstance::adjustCommandLineOptions() {
     exit(1);
   }
 
-  if (opts::ReorderFunctions != ReorderFunctions::RT_NONE &&
-      !opts::HotText.getNumOccurrences()) {
+  if (opts::Instrument ||
+      (opts::ReorderFunctions != ReorderFunctions::RT_NONE &&
+       !opts::HotText.getNumOccurrences())) {
     opts::HotText = true;
   } else if (opts::HotText && !BC->HasRelocations) {
     errs() << "BOLT-WARNING: hot text is disabled in non-relocation mode\n";
diff --git a/bolt/runtime/common.h b/bolt/runtime/common.h
index 5cb1298b8a5b7..9e6f1756c5707 100644
--- a/bolt/runtime/common.h
+++ b/bolt/runtime/common.h
@@ -165,6 +165,20 @@ int memcmp(const void *s1, const void *s2, size_t n) {
 // Anonymous namespace covering everything but our library entry point
 namespace {
 
+// Get the difference between runtime address of .text section and
+// static address in section header table. Can be subtracted from arbitrary
+// pc value recorded at runtime to get the corresponding static address, which
+// in turn can be used to search for indirect call description. Needed because
+// indirect call descriptions are read-only non-relocatable data.
+uint64_t getTextBaseAddress() {
+  uint64_t DynAddr;
+  uint64_t StaticAddr;
+  __asm__ volatile("leaq __hot_end(%%rip), %0\n\t"
+                   "movabsq $__hot_end, %1\n\t"
+                   : "=r"(DynAddr), "=r"(StaticAddr));
+  return DynAddr - StaticAddr;
+}
+
 constexpr uint32_t BufSize = 10240;
 
 #define _STRINGIFY(x) #x
diff --git a/bolt/runtime/instr.cpp b/bolt/runtime/instr.cpp
index 147cad023290c..96a43f685befa 100644
--- a/bolt/runtime/instr.cpp
+++ b/bolt/runtime/instr.cpp
@@ -215,6 +215,12 @@ class BumpPtrAllocator {
 /// __bolt_instr_setup, our initialization routine.
 BumpPtrAllocator *GlobalAlloc;
 
+// Base address which we subtract from recorded PC values when searching for
+// indirect call description entries. Needed because indCall descriptions are
+// mapped read-only and contain static addresses. Initialized in
+// __bolt_instr_setup.
+uint64_t TextBaseAddress = 0;
+
 // Storage for GlobalAlloc which can be shared if not using
 // instrumentation-file-append-pid.
 void *GlobalMetadataStorage;
@@ -1389,7 +1395,7 @@ void visitIndCallCounter(IndirectCallHashTable::MapEntry &Entry,
   const IndCallDescription *CallsiteDesc =
       &Ctx->IndCallDescriptions[CallsiteID];
   const IndCallTargetDescription *TargetDesc =
-      Ctx->lookupIndCallTarget(Entry.Key);
+      Ctx->lookupIndCallTarget(Entry.Key - TextBaseAddress);
   if (!TargetDesc) {
     DEBUG(report("Failed to lookup indirect call target\n"));
     char LineBuf[BufSize];
@@ -1609,6 +1615,7 @@ extern "C" void __bolt_instr_indirect_tailcall();
 extern "C" void __attribute((force_align_arg_pointer)) __bolt_instr_setup() {
   __bolt_ind_call_counter_func_pointer = __bolt_instr_indirect_call;
   __bolt_ind_tailcall_counter_func_pointer = __bolt_instr_indirect_tailcall;
+  TextBaseAddress = getTextBaseAddress();
 
   const uint64_t CountersStart =
       reinterpret_cast<uint64_t>(&__bolt_instr_locations[0]);

From 3670e6a5c8a5f6a7665572c9d90bab9192f713eb Mon Sep 17 00:00:00 2001
From: Denis Revunov <revunov.denis@huawei-partners.com>
Date: Fri, 23 Jun 2023 16:06:58 +0000
Subject: [PATCH 31/92] [BOLT][Instrumentation] Add test for append-pid option

Reviewed By: rafauler

Differential Revision: https://reviews.llvm.org/D154121

(cherry picked from commit dfc759929644ed1ea3224ab30e1086f7acc60da6)
---
 bolt/test/lit.cfg.py                          |   3 +
 .../test/runtime/instrumentation-indirect-2.c | 168 ++++++++++++++++++
 bolt/test/runtime/wait_file.sh                |  48 +++++
 3 files changed, 219 insertions(+)
 create mode 100644 bolt/test/runtime/instrumentation-indirect-2.c
 create mode 100644 bolt/test/runtime/wait_file.sh

diff --git a/bolt/test/lit.cfg.py b/bolt/test/lit.cfg.py
index fe27af87f9106..3a6da210e01f0 100644
--- a/bolt/test/lit.cfg.py
+++ b/bolt/test/lit.cfg.py
@@ -72,6 +72,9 @@
 if config.gnu_ld:
     config.available_features.add("gnu_ld")
 
+if lit.util.which("fuser"):
+    config.available_features.add("fuser")
+
 llvm_config.use_default_substitutions()
 
 llvm_config.config.environment["CLANG"] = config.bolt_clang
diff --git a/bolt/test/runtime/instrumentation-indirect-2.c b/bolt/test/runtime/instrumentation-indirect-2.c
new file mode 100644
index 0000000000000..7d19db14b77f0
--- /dev/null
+++ b/bolt/test/runtime/instrumentation-indirect-2.c
@@ -0,0 +1,168 @@
+// Check that indirect call hash tables properly register multiple calls,
+// and that calls from different processes don't get mixed up when using
+// --instrumentation-file-append-pid.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+__attribute__((noinline)) void funcA(int pid) { printf("funcA %d\n", pid); }
+__attribute__((noinline)) void funcB(int pid) { printf("funcB %d\n", pid); }
+__attribute__((noinline)) void funcC(int pid) { printf("funcC %d\n", pid); }
+__attribute__((noinline)) void funcD(int pid) { printf("funcD %d\n", pid); }
+__attribute__((noinline)) void funcE(int pid) { printf("funcE %d\n", pid); }
+__attribute__((noinline)) void funcF(int pid) { printf("funcF %d\n", pid); }
+__attribute__((noinline)) void funcG(int pid) { printf("funcG %d\n", pid); }
+__attribute__((noinline)) void funcH(int pid) { printf("funcH %d\n", pid); }
+__attribute__((noinline)) void funcI(int pid) { printf("funcI %d\n", pid); }
+__attribute__((noinline)) void funcJ(int pid) { printf("funcJ %d\n", pid); }
+__attribute__((noinline)) void funcK(int pid) { printf("funcK %d\n", pid); }
+__attribute__((noinline)) void funcL(int pid) { printf("funcL %d\n", pid); }
+__attribute__((noinline)) void funcM(int pid) { printf("funcM %d\n", pid); }
+__attribute__((noinline)) void funcN(int pid) { printf("funcN %d\n", pid); }
+__attribute__((noinline)) void funcO(int pid) { printf("funcO %d\n", pid); }
+__attribute__((noinline)) void funcP(int pid) { printf("funcP %d\n", pid); }
+
+int main() {
+
+  void (*funcs[])(int) = {funcA, funcB, funcC, funcD, funcE, funcF,
+                          funcG, funcH, funcI, funcJ, funcK, funcL,
+                          funcM, funcN, funcO, funcP};
+  int i;
+
+  switch (fork()) {
+  case -1:
+    printf("Failed to fork!\n");
+    exit(-1);
+    break;
+  case 0:
+    i = 0;
+    break;
+  default:
+    i = 1;
+    break;
+  }
+  int pid = getpid();
+  for (; i < sizeof(funcs) / sizeof(void *); i += 2) {
+    funcs[i](pid);
+  }
+
+  return 0;
+}
+/*
+REQUIRES: system-linux,shell,fuser
+
+RUN: %clang %cflags %s -o %t.exe -Wl,-q -pie -fpie
+
+RUN: llvm-bolt %t.exe --instrument --instrumentation-file=%t.fdata \
+RUN:   --conservative-instrumentation -o %t.instrumented_conservative \
+RUN: --instrumentation-sleep-time=1 --instrumentation-no-counters-clear \
+RUN: --instrumentation-wait-forks
+
+# Instrumented program needs to finish returning zero
+# Both output and profile must contain all 16 functions
+RUN: %t.instrumented_conservative > %t.output
+# Wait for profile and output to be fully written
+RUN: bash %S/wait_file.sh %t.output
+RUN: bash %S/wait_file.sh %t.fdata
+RUN: cat %t.output | FileCheck %s --check-prefix=CHECK-OUTPUT
+RUN: cat %t.fdata | FileCheck %s --check-prefix=CHECK-COMMON-PROF
+
+CHECK-OUTPUT-DAG: funcA
+CHECK-OUTPUT-DAG: funcB
+CHECK-OUTPUT-DAG: funcC
+CHECK-OUTPUT-DAG: funcD
+CHECK-OUTPUT-DAG: funcE
+CHECK-OUTPUT-DAG: funcF
+CHECK-OUTPUT-DAG: funcG
+CHECK-OUTPUT-DAG: funcH
+CHECK-OUTPUT-DAG: funcI
+CHECK-OUTPUT-DAG: funcJ
+CHECK-OUTPUT-DAG: funcK
+CHECK-OUTPUT-DAG: funcL
+CHECK-OUTPUT-DAG: funcM
+CHECK-OUTPUT-DAG: funcN
+CHECK-OUTPUT-DAG: funcO
+CHECK-OUTPUT-DAG: funcP
+
+CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcA 0 0 1
+CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcB 0 0 1
+CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcC 0 0 1
+CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcD 0 0 1
+CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcE 0 0 1
+CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcF 0 0 1
+CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcG 0 0 1
+CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcH 0 0 1
+CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcI 0 0 1
+CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcJ 0 0 1
+CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcK 0 0 1
+CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcL 0 0 1
+CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcM 0 0 1
+CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcN 0 0 1
+CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcO 0 0 1
+CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcP 0 0 1
+
+RUN: llvm-bolt %t.exe --instrument --instrumentation-file=%t \
+RUN:   --instrumentation-file-append-pid \
+RUN:   -o %t.instrumented
+
+RUN: %t.instrumented > %t.output
+# Wait till output is fully written in case child outlives parent
+RUN: bash %S/wait_file.sh %t.output
+# Make sure all functions were called
+RUN: cat %t.output | FileCheck %s --check-prefix=CHECK-OUTPUT
+
+RUN: child_pid=$(cat %t.output | grep funcA | awk '{print $2;}')
+RUN: par_pid=$(cat %t.output | grep funcB | awk '{print $2;}')
+
+RUN: bash %S/wait_file.sh %t.$child_pid.fdata
+RUN: bash %S/wait_file.sh %t.$par_pid.fdata
+
+RUN: mv %t.$child_pid.fdata %t.child.fdata
+RUN: mv %t.$par_pid.fdata %t.parent.fdata
+
+# Instrumented binary must produce two profiles with only local calls
+# recorded. Functions called only in child should not appear in parent's
+# process and vice versa.
+RUN: cat %t.child.fdata | FileCheck %s --check-prefix=CHECK-CHILD
+RUN: cat %t.child.fdata | FileCheck %s --check-prefix=CHECK-NOCHILD
+RUN: cat %t.parent.fdata | FileCheck %s --check-prefix=CHECK-PARENT
+RUN: cat %t.parent.fdata | FileCheck %s --check-prefix=CHECK-NOPARENT
+
+CHECK-CHILD-DAG: 1 main {{[0-9a-f]+}} 1 funcA 0 0 1
+CHECK-CHILD-DAG: 1 main {{[0-9a-f]+}} 1 funcC 0 0 1
+CHECK-CHILD-DAG: 1 main {{[0-9a-f]+}} 1 funcE 0 0 1
+CHECK-CHILD-DAG: 1 main {{[0-9a-f]+}} 1 funcG 0 0 1
+CHECK-CHILD-DAG: 1 main {{[0-9a-f]+}} 1 funcI 0 0 1
+CHECK-CHILD-DAG: 1 main {{[0-9a-f]+}} 1 funcK 0 0 1
+CHECK-CHILD-DAG: 1 main {{[0-9a-f]+}} 1 funcM 0 0 1
+CHECK-CHILD-DAG: 1 main {{[0-9a-f]+}} 1 funcO 0 0 1
+
+CHECK-NOCHILD-NOT: funcB
+CHECK-NOCHILD-NOT: funcD
+CHECK-NOCHILD-NOT: funcF
+CHECK-NOCHILD-NOT: funcH
+CHECK-NOCHILD-NOT: funcJ
+CHECK-NOCHILD-NOT: funcL
+CHECK-NOCHILD-NOT: funcN
+CHECK-NOCHILD-NOT: funcP
+
+CHECK-PARENT-DAG: 1 main {{[0-9a-f]+}} 1 funcB 0 0 1
+CHECK-PARENT-DAG: 1 main {{[0-9a-f]+}} 1 funcD 0 0 1
+CHECK-PARENT-DAG: 1 main {{[0-9a-f]+}} 1 funcF 0 0 1
+CHECK-PARENT-DAG: 1 main {{[0-9a-f]+}} 1 funcH 0 0 1
+CHECK-PARENT-DAG: 1 main {{[0-9a-f]+}} 1 funcJ 0 0 1
+CHECK-PARENT-DAG: 1 main {{[0-9a-f]+}} 1 funcL 0 0 1
+CHECK-PARENT-DAG: 1 main {{[0-9a-f]+}} 1 funcN 0 0 1
+CHECK-PARENT-DAG: 1 main {{[0-9a-f]+}} 1 funcP 0 0 1
+
+CHECK-NOPARENT-NOT: funcA
+CHECK-NOPARENT-NOT: funcC
+CHECK-NOPARENT-NOT: funcE
+CHECK-NOPARENT-NOT: funcG
+CHECK-NOPARENT-NOT: funcI
+CHECK-NOPARENT-NOT: funcK
+CHECK-NOPARENT-NOT: funcM
+CHECK-NOPARENT-NOT: funcO
+
+ */
diff --git a/bolt/test/runtime/wait_file.sh b/bolt/test/runtime/wait_file.sh
new file mode 100644
index 0000000000000..42d4c5b29e795
--- /dev/null
+++ b/bolt/test/runtime/wait_file.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+check_file() {
+    local file="$1"
+    if [ -z "$file" ]; then
+        echo "No file passed!"
+        exit 1
+    fi
+    if [ ! -f "$file" ]; then
+        return 1
+    fi
+
+    fuser -s "$file"
+    local ret=$?
+    if [ $ret -eq 1 ]; then # no one has file open
+        return 0
+    fi
+    if [ $ret -eq 0 ]; then # file open by some processes
+        return 1
+    fi
+    if [ $ret -eq 127 ]; then
+        echo "fuser command not found!"
+        exit 1
+    fi
+
+    echo "Unexpected exit code $ret from fuser!"
+    exit 1
+}
+
+wait_file() {
+    local file="$1"
+    local max_sleep=10 # number of retries, one second apart
+    check_file "$file"
+    local ret=$?
+    while [ $ret -ne 0 ] && [ $max_sleep -ne 0 ]; do
+        sleep 1
+        max_sleep=$((max_sleep - 1))
+        check_file "$file"
+        ret=$?
+    done
+    # Judge by the last check so a file that is ready on the final retry passes.
+    if [ $ret -ne 0 ]; then
+        echo "The file does not exist or the test hung!"
+        exit 1
+    fi
+}
+file="$1"
+wait_file "$file"

From 77d7f7d769c63ade5c1697fffcd0a9e6d5f5261d Mon Sep 17 00:00:00 2001
From: Lei Huang <lei@ca.ibm.com>
Date: Wed, 23 Aug 2023 15:45:39 -0500
Subject: [PATCH 32/92] [PowerPC] Exclude frexp(long double) on linux

PowerPC on linux currently doesn't have support for lowering long double for
frexp().  Removing the tests until implementation is provided.

Reviewed By: #libc, amyk, Mordante

Differential Revision: https://reviews.llvm.org/D158547

(cherry picked from commit 5adac8bebcf26841c1d87227c5043af83a9ef94b)
---
 .../libcxx/numerics/c.math/constexpr-cxx23-clang.pass.cpp   | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/libcxx/test/libcxx/numerics/c.math/constexpr-cxx23-clang.pass.cpp b/libcxx/test/libcxx/numerics/c.math/constexpr-cxx23-clang.pass.cpp
index a07260a34516f..31511064ce7ca 100644
--- a/libcxx/test/libcxx/numerics/c.math/constexpr-cxx23-clang.pass.cpp
+++ b/libcxx/test/libcxx/numerics/c.math/constexpr-cxx23-clang.pass.cpp
@@ -58,9 +58,15 @@ int main(int, char**) {
 
   ASSERT_NOT_CONSTEXPR_CXX23(std::frexp(0.0f, &DummyInt) == 0.0f);
   ASSERT_NOT_CONSTEXPR_CXX23(std::frexp(0.0, &DummyInt) == 0.0);
+//FIXME: currently linux powerpc does not support this expansion
+// since 0.0L lowers to ppcf128 and special handling is required.
+#if !defined(__LONG_DOUBLE_IBM128__)
   ASSERT_NOT_CONSTEXPR_CXX23(std::frexp(0.0L, &DummyInt) == 0.0L);
+#endif
   ASSERT_NOT_CONSTEXPR_CXX23(std::frexpf(0.0f, &DummyInt) == 0.0f);
+#if !defined(__LONG_DOUBLE_IBM128__)
   ASSERT_NOT_CONSTEXPR_CXX23(std::frexpl(0.0L, &DummyInt) == 0.0L);
+#endif
 
   ASSERT_NOT_CONSTEXPR_CXX23(std::ilogb(1.0f) == 0);
   ASSERT_NOT_CONSTEXPR_CXX23(std::ilogb(1.0) == 0);

From ef7421f3f74b44d78886d9e5e0891384251b20a0 Mon Sep 17 00:00:00 2001
From: David Tellenbach <dtellenbach@apple.com>
Date: Wed, 23 Aug 2023 14:46:31 -0700
Subject: [PATCH 33/92] [AArch64] Check opcode before trying to extract
 register from operand

When matching FNEG patterns for the MachineCombiner we need to check for
opcodes first, before trying to extract a register from an operand.
Otherwise handling of instructions with non-register operands causes the
compiler to crash.

Differential Revision: https://reviews.llvm.org/D158473

(cherry picked from commit 979e8ae4fce64546c65d24864eedd8165bc9787b)
---
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  |   4 +-
 .../emit_fneg_with_non_register_operand.mir   | 130 ++++++++++++++++++
 2 files changed, 132 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/emit_fneg_with_non_register_operand.mir

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index bd03ffaafab10..30bd580ad86a7 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -5433,8 +5433,8 @@ static bool getFNEGPatterns(MachineInstr &Root,
   auto Match = [&](unsigned Opcode, MachineCombinerPattern Pattern) -> bool {
     MachineOperand &MO = Root.getOperand(1);
     MachineInstr *MI = MRI.getUniqueVRegDef(MO.getReg());
-    if (MI != nullptr && MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) &&
-        (MI->getOpcode() == Opcode) &&
+    if (MI != nullptr && (MI->getOpcode() == Opcode) &&
+        MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) &&
         Root.getFlag(MachineInstr::MIFlag::FmContract) &&
         Root.getFlag(MachineInstr::MIFlag::FmNsz) &&
         MI->getFlag(MachineInstr::MIFlag::FmContract) &&
diff --git a/llvm/test/CodeGen/AArch64/emit_fneg_with_non_register_operand.mir b/llvm/test/CodeGen/AArch64/emit_fneg_with_non_register_operand.mir
new file mode 100644
index 0000000000000..6fe094cc6cbb4
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/emit_fneg_with_non_register_operand.mir
@@ -0,0 +1,130 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -mtriple aarch64 -run-pass=machine-combiner -o - %s | FileCheck %s
+--- |
+  target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64"
+
+  @c = global double 0.000000e+00, align 8
+
+  define void @emit_fneg_with_non_register_operand(double %c) {
+  entry:
+    %0 = load double, ptr @c, align 8
+    %1 = tail call double asm sideeffect "", "=w,0"(double %0)
+    %2 = load double, ptr @c, align 8
+    %3 = tail call double asm sideeffect "", "=w,0"(double %2)
+    %fneg = fneg double %1
+    %cmp = fcmp oeq double %3, %fneg
+    br i1 %cmp, label %if.then, label %if.end
+
+  if.then:                                          ; preds = %entry
+    tail call void @b(double noundef %1)
+    ret void
+
+  if.end:                                           ; preds = %entry
+    ret void
+  }
+
+  declare void @b(double noundef)
+
+...
+---
+name:            emit_fneg_with_non_register_operand
+alignment:       4
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHCatchret:   false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+  - { id: 0, class: fpr64, preferred-register: '' }
+  - { id: 1, class: fpr64, preferred-register: '' }
+  - { id: 2, class: fpr64, preferred-register: '' }
+  - { id: 3, class: fpr64, preferred-register: '' }
+  - { id: 4, class: fpr64, preferred-register: '' }
+  - { id: 5, class: fpr64, preferred-register: '' }
+  - { id: 6, class: gpr64common, preferred-register: '' }
+  - { id: 7, class: fpr64, preferred-register: '' }
+liveins:         []
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    1
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     true
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:           []
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: emit_fneg_with_non_register_operand
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.1(0x50000000), %bb.2(0x30000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[LOADgot:%[0-9]+]]:gpr64common = LOADgot target-flags(aarch64-got) @c
+  ; CHECK-NEXT:   [[LDRDui:%[0-9]+]]:fpr64 = LDRDui [[LOADgot]], 0 :: (dereferenceable load (s64) from @c)
+  ; CHECK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, 2359306 /* regdef:FPR64 */, def %2, 2147483657 /* reguse tiedto:$0 */, [[LDRDui]](tied-def 3)
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:fpr64 = COPY %2
+  ; CHECK-NEXT:   [[LDRDui1:%[0-9]+]]:fpr64 = LDRDui [[LOADgot]], 0 :: (dereferenceable load (s64) from @c)
+  ; CHECK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, 2359306 /* regdef:FPR64 */, def %4, 2147483657 /* reguse tiedto:$0 */, [[LDRDui1]](tied-def 3)
+  ; CHECK-NEXT:   [[FNEGDr:%[0-9]+]]:fpr64 = FNEGDr %2
+  ; CHECK-NEXT:   nofpexcept FCMPDrr %4, killed [[FNEGDr]], implicit-def $nzcv, implicit $fpcr
+  ; CHECK-NEXT:   Bcc 1, %bb.2, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1.if.then:
+  ; CHECK-NEXT:   $d0 = COPY [[COPY]]
+  ; CHECK-NEXT:   TCRETURNdi @b, 0, csr_aarch64_aapcs, implicit $sp, implicit $d0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2.if.end:
+  ; CHECK-NEXT:   RET_ReallyLR
+  bb.0.entry:
+    successors: %bb.1(0x50000000), %bb.2(0x30000000)
+
+    %6:gpr64common = LOADgot target-flags(aarch64-got) @c
+    %3:fpr64 = LDRDui %6, 0 :: (dereferenceable load (s64) from @c)
+    INLINEASM &"", 1 /* sideeffect attdialect */, 2359306 /* regdef:FPR64 */, def %2, 2147483657 /* reguse tiedto:$0 */, %3(tied-def 3)
+    %0:fpr64 = COPY %2
+    %5:fpr64 = LDRDui %6, 0 :: (dereferenceable load (s64) from @c)
+    INLINEASM &"", 1 /* sideeffect attdialect */, 2359306 /* regdef:FPR64 */, def %4, 2147483657 /* reguse tiedto:$0 */, %5(tied-def 3)
+    %7:fpr64 = FNEGDr %2
+    nofpexcept FCMPDrr %4, killed %7, implicit-def $nzcv, implicit $fpcr
+    Bcc 1, %bb.2, implicit $nzcv
+    B %bb.1
+
+  bb.1.if.then:
+    $d0 = COPY %0
+    TCRETURNdi @b, 0, csr_aarch64_aapcs, implicit $sp, implicit $d0
+
+  bb.2.if.end:
+    RET_ReallyLR
+
+...

From 0ec768e415b7468a66ab53fcbb78649a1d2474a8 Mon Sep 17 00:00:00 2001
From: Samuel Thibault <samuel.thibault@ens-lyon.org>
Date: Fri, 25 Aug 2023 19:53:20 +0200
Subject: [PATCH 34/92] [libc++] Fix GNU/Hurd build

GNU/Hurd does have clock_gettime, it just doesn't define _POSIX_TIMERS because its support for timers is not complete.

Reviewed By: #libc, Mordante

Differential Revision: https://reviews.llvm.org/D158584

(cherry picked from commit 1cfcc36812ff7857567f7c729c22ae0e2be0fb3a)
---
 libcxx/src/chrono.cpp                      | 2 +-
 libcxx/src/filesystem/filesystem_clock.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcxx/src/chrono.cpp b/libcxx/src/chrono.cpp
index 0990d8dc181c2..f1596132024c9 100644
--- a/libcxx/src/chrono.cpp
+++ b/libcxx/src/chrono.cpp
@@ -31,7 +31,7 @@
 # include <sys/time.h> // for gettimeofday and timeval
 #endif
 
-#if defined(__APPLE__) || (defined(_POSIX_TIMERS) && _POSIX_TIMERS > 0)
+#if defined(__APPLE__) || defined (__gnu_hurd__) || (defined(_POSIX_TIMERS) && _POSIX_TIMERS > 0)
 # define _LIBCPP_HAS_CLOCK_GETTIME
 #endif
 
diff --git a/libcxx/src/filesystem/filesystem_clock.cpp b/libcxx/src/filesystem/filesystem_clock.cpp
index d00cdc6df3437..fbb19ac68df55 100644
--- a/libcxx/src/filesystem/filesystem_clock.cpp
+++ b/libcxx/src/filesystem/filesystem_clock.cpp
@@ -29,7 +29,7 @@
 # include <sys/time.h> // for gettimeofday and timeval
 #endif
 
-#if defined(__APPLE__) || (defined(_POSIX_TIMERS) && _POSIX_TIMERS > 0)
+#if defined(__APPLE__) || defined (__gnu_hurd__) || (defined(_POSIX_TIMERS) && _POSIX_TIMERS > 0)
 # define _LIBCPP_HAS_CLOCK_GETTIME
 #endif
 

From e91ad6b97fcbf74747126264bd4e07d6668f0f94 Mon Sep 17 00:00:00 2001
From: Rainer Orth <ro@gcc.gnu.org>
Date: Fri, 18 Aug 2023 21:09:37 +0200
Subject: [PATCH 35/92] [Driver] Add PIE support on Solaris

`clang` currently lacks PIE support on Solaris.  This patch fixes this,
also linking with `crtbeginS.o` and `crtendS.o` for `-pie` and `-shared`.

Tested on `amd64-pc-solaris2.11`, `sparcv9-sun-solaris2.11`, and
`x86_64-pc-linux-gnu`.

Differential Revision: https://reviews.llvm.org/D158206

(cherry picked from commit 62945bb811169ffac7cf22c64b6dd3a3ad8d38f0)
---
 clang/lib/Driver/ToolChains/Solaris.cpp       | 35 +++++++++++--
 .../sparc-sun-solaris2.11/4.8.2/crtbeginS.o   |  0
 .../4.8.2/sparcv9/crtbeginS.o                 |  0
 clang/test/Driver/solaris-ld.c                | 49 +++++++++++++++++++
 4 files changed, 80 insertions(+), 4 deletions(-)
 create mode 100644 clang/test/Driver/Inputs/solaris_sparc_tree/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/crtbeginS.o
 create mode 100644 clang/test/Driver/Inputs/solaris_sparc_tree/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/sparcv9/crtbeginS.o

diff --git a/clang/lib/Driver/ToolChains/Solaris.cpp b/clang/lib/Driver/ToolChains/Solaris.cpp
index 86c789f5fcef5..de5a69e4ca3fd 100644
--- a/clang/lib/Driver/ToolChains/Solaris.cpp
+++ b/clang/lib/Driver/ToolChains/Solaris.cpp
@@ -47,11 +47,24 @@ void solaris::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
                                          Exec, CmdArgs, Inputs, Output));
 }
 
+static bool getPIE(const ArgList &Args, const ToolChain &TC) {
+  if (Args.hasArg(options::OPT_shared) || Args.hasArg(options::OPT_static) ||
+      Args.hasArg(options::OPT_r))
+    return false;
+
+  Arg *A = Args.getLastArg(options::OPT_pie, options::OPT_no_pie,
+                           options::OPT_nopie);
+  if (!A)
+    return TC.isPIEDefault(Args);
+  return A->getOption().matches(options::OPT_pie);
+}
+
 void solaris::Linker::ConstructJob(Compilation &C, const JobAction &JA,
                                    const InputInfo &Output,
                                    const InputInfoList &Inputs,
                                    const ArgList &Args,
                                    const char *LinkingOutput) const {
+  const bool IsPIE = getPIE(Args, getToolChain());
   ArgStringList CmdArgs;
 
   // Demangle C++ names in errors
@@ -62,6 +75,11 @@ void solaris::Linker::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back("_start");
   }
 
+  if (IsPIE) {
+    CmdArgs.push_back("-z");
+    CmdArgs.push_back("type=pie");
+  }
+
   if (Args.hasArg(options::OPT_static)) {
     CmdArgs.push_back("-Bstatic");
     CmdArgs.push_back("-dn");
@@ -113,8 +131,13 @@ void solaris::Linker::ConstructJob(Compilation &C, const JobAction &JA,
       values_xpg = "values-xpg4.o";
     CmdArgs.push_back(
         Args.MakeArgString(getToolChain().GetFilePath(values_xpg)));
-    CmdArgs.push_back(
-        Args.MakeArgString(getToolChain().GetFilePath("crtbegin.o")));
+
+    const char *crtbegin = nullptr;
+    if (Args.hasArg(options::OPT_shared) || IsPIE)
+      crtbegin = "crtbeginS.o";
+    else
+      crtbegin = "crtbegin.o";
+    CmdArgs.push_back(Args.MakeArgString(getToolChain().GetFilePath(crtbegin)));
     // Add crtfastmath.o if available and fast math is enabled.
     getToolChain().addFastMathRuntimeIfAvailable(Args, CmdArgs);
   }
@@ -171,8 +194,12 @@ void solaris::Linker::ConstructJob(Compilation &C, const JobAction &JA,
 
   if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles,
                    options::OPT_r)) {
-    CmdArgs.push_back(
-        Args.MakeArgString(getToolChain().GetFilePath("crtend.o")));
+    if (Args.hasArg(options::OPT_shared) || IsPIE)
+      CmdArgs.push_back(
+          Args.MakeArgString(getToolChain().GetFilePath("crtendS.o")));
+    else
+      CmdArgs.push_back(
+          Args.MakeArgString(getToolChain().GetFilePath("crtend.o")));
     CmdArgs.push_back(
         Args.MakeArgString(getToolChain().GetFilePath("crtn.o")));
   }
diff --git a/clang/test/Driver/Inputs/solaris_sparc_tree/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/crtbeginS.o b/clang/test/Driver/Inputs/solaris_sparc_tree/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/crtbeginS.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/solaris_sparc_tree/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/sparcv9/crtbeginS.o b/clang/test/Driver/Inputs/solaris_sparc_tree/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/sparcv9/crtbeginS.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/solaris-ld.c b/clang/test/Driver/solaris-ld.c
index 2127ad5ded074..8d97a5a3695bd 100644
--- a/clang/test/Driver/solaris-ld.c
+++ b/clang/test/Driver/solaris-ld.c
@@ -106,6 +106,33 @@
 // CHECK-SPARC32-SHARED-NOT: "-lgcc"
 // CHECK-SPARC32-SHARED-NOT: "-lm"
 
+// Check the right ld flags are present with -pie.
+// RUN: %clang --target=sparc-sun-solaris2.11 -### %s -pie \
+// RUN:     --gcc-toolchain="" \
+// RUN:     --sysroot=%S/Inputs/solaris_sparc_tree 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-PIE %s
+// RUN: %clang --target=sparc-sun-solaris2.11 -### %s -nopie \
+// RUN:     --gcc-toolchain="" \
+// RUN:     --sysroot=%S/Inputs/solaris_sparc_tree 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-NOPIE %s
+
+// Check that -shared/-r/-static disable PIE.
+// RUN: %clang --target=sparc-sun-solaris2.11 -### %s -shared -pie \
+// RUN:     --gcc-toolchain="" \
+// RUN:     --sysroot=%S/Inputs/solaris_sparc_tree 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-NOPIE %s
+// RUN: %clang --target=sparc-sun-solaris2.11 -### %s -r -pie \
+// RUN:     --gcc-toolchain="" \
+// RUN:     --sysroot=%S/Inputs/solaris_sparc_tree 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-NOPIE %s
+// RUN: %clang --target=sparc-sun-solaris2.11 -### %s -static -pie \
+// RUN:     --gcc-toolchain="" \
+// RUN:     --sysroot=%S/Inputs/solaris_sparc_tree 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-NOPIE %s
+
+// CHECK-PIE: "-z" "type=pie"
+// CHECK-NOPIE-NOT: "-z" "type=pie"
+
 // -r suppresses default -l and crt*.o, values-*.o like -nostdlib.
 // RUN: %clang -### %s --target=sparc-sun-solaris2.11 -r 2>&1 \
 // RUN:   | FileCheck %s --check-prefix=CHECK-RELOCATABLE
@@ -115,6 +142,28 @@
 // CHECK-RELOCATABLE-NOT: /crt{{[^.]+}}.o
 // CHECK-RELOCATABLE-NOT: /values-{{[^.]+}}.o
 
+// Check that crt{begin,end}S.o is linked with -shared/-pie.
+// RUN: %clang --target=sparc-sun-solaris2.11 -### %s \
+// RUN:        --gcc-toolchain="" \
+// RUN:        --sysroot=%S/Inputs/solaris_sparc_tree 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-NOCRTS %s
+// RUN: %clang --target=sparc-sun-solaris2.11 -### %s -shared \
+// RUN:        --gcc-toolchain="" \
+// RUN:        --sysroot=%S/Inputs/solaris_sparc_tree 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-CRTS %s
+// RUN: %clang --target=sparc-sun-solaris2.11 -### %s -nopie \
+// RUN:        --gcc-toolchain="" \
+// RUN:        --sysroot=%S/Inputs/solaris_sparc_tree 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-NOCRTS %s
+// RUN: %clang --target=sparc-sun-solaris2.11 -### %s -pie \
+// RUN:        --gcc-toolchain="" \
+// RUN:        --sysroot=%S/Inputs/solaris_sparc_tree 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-CRTS %s
+// CHECK-CRTS: crtbeginS.o
+// CHECK-CRTS: crtendS.o
+// CHECK-NOCRTS-NOT: crtbeginS.o
+// CHECK-NOCRTS-NOT: crtendS.o
+
 // Check that crtfastmath.o is linked with -ffast-math.
 
 // Check sparc-sun-solaris2.11, 32bit

From baae3c31aba93f548d3249536102c28847b81e1b Mon Sep 17 00:00:00 2001
From: Tobias Hieta <tobias@hieta.se>
Date: Mon, 28 Aug 2023 12:13:07 +0200
Subject: [PATCH 36/92] Revert "[clang] Run test for concrete target"

This reverts commit e54f48384bb213f2c204c74d4e7e08a13904a9d6.
---
 clang/test/SemaCXX/template-64605.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/test/SemaCXX/template-64605.cpp b/clang/test/SemaCXX/template-64605.cpp
index 99ccbfdc27f1c..b13acbf2ae566 100644
--- a/clang/test/SemaCXX/template-64605.cpp
+++ b/clang/test/SemaCXX/template-64605.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -ast-dump -ast-dump-filter=b_64605 %s | FileCheck %s
+// RUN: %clang_cc1 -ast-dump -ast-dump-filter=b_64605 %s | FileCheck %s
 
 // https://github.com/llvm/llvm-project/issues/64605
 

From 0638df00951ca454b97f9db546a7f85f32258aa8 Mon Sep 17 00:00:00 2001
From: Tobias Hieta <tobias@hieta.se>
Date: Mon, 28 Aug 2023 12:13:13 +0200
Subject: [PATCH 37/92] Revert "[clang] Set FP options in Sema when
 instantiating CompoundStmt"

This reverts commit f1d5ea362577a8a1b5fafe775cf82a449daa3b07.
---
 clang/lib/Sema/TreeTransform.h        |  4 ----
 clang/test/SemaCXX/template-64605.cpp | 23 -----------------------
 2 files changed, 27 deletions(-)
 delete mode 100644 clang/test/SemaCXX/template-64605.cpp

diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 097e81ea7d45a..10b3587885e39 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -7478,10 +7478,6 @@ StmtResult
 TreeTransform<Derived>::TransformCompoundStmt(CompoundStmt *S,
                                               bool IsStmtExpr) {
   Sema::CompoundScopeRAII CompoundScope(getSema());
-  Sema::FPFeaturesStateRAII FPSave(getSema());
-  if (S->hasStoredFPFeatures())
-    getSema().resetFPOptions(
-        S->getStoredFPFeatures().applyOverrides(getSema().getLangOpts()));
 
   const Stmt *ExprResult = S->getStmtExprResult();
   bool SubStmtInvalid = false;
diff --git a/clang/test/SemaCXX/template-64605.cpp b/clang/test/SemaCXX/template-64605.cpp
deleted file mode 100644
index b13acbf2ae566..0000000000000
--- a/clang/test/SemaCXX/template-64605.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-// RUN: %clang_cc1 -ast-dump -ast-dump-filter=b_64605 %s | FileCheck %s
-
-// https://github.com/llvm/llvm-project/issues/64605
-
-#pragma STDC FENV_ACCESS ON
-template <typename>
-int b_64605() {
-  int x;
-  if ((float)0xFFFFFFFF != (float)0x100000000) {
-    x = 1;
-  }
-  return x;
-}
-int f() { return b_64605<void>(); }
-
-// CHECK:      ImplicitCastExpr {{.*}} 'float' <IntegralToFloating> RoundingMath=1 AllowFEnvAccess=1
-// CHECK-NEXT: IntegerLiteral {{.*}} 4294967295
-
-// CHECK:      FunctionDecl {{.*}} b_64605 'int ()' implicit_instantiation
-// CHECK-NEXT: TemplateArgument type 'void'
-
-// CHECK:      ImplicitCastExpr {{.*}} 'float' <IntegralToFloating> RoundingMath=1 AllowFEnvAccess=1
-// CHECK-NEXT: IntegerLiteral {{.*}} 4294967295

From ed108ee79b3119a4a83bc22b16abe5ff5a550825 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Mon, 28 Aug 2023 13:37:33 -0700
Subject: [PATCH 38/92] [Driver,X86] Ignore -mfpmath= for assembler input

Some options are only claimed in AddX86TargetArgs/etc (called by
Clang::RenderTargetOptions).
For assembler input, `Add*TargetArgs` is not called. If an option is
unclaimed, it either leads to a -Wunused-command-line-argument warning
or an error (if `TargetSpecific` is set)
```
// clang '-###' --target=x86_64 -mfpmath=sse -c a.s
clang: error: unsupported option '-mfpmath=sse' for target 'x86_64'
```

For -mfpmath=, it's actually claimed by RenderFloatingPointOptions,
which should be moved to AddARMTargetArgs/AddX86TargetArgs later
(non-AArch32-non-x86 targets give a frontend error).
This change is localized and similar to D153691, for release/17.x
backporting.

Fix https://github.com/llvm/llvm-project/issues/65023

Reviewed By: thesamesam

Differential Revision: https://reviews.llvm.org/D159010

(cherry picked from commit 081afa3d04a4bc0d43c62b5b0e5a84f86a8a70ec)
---
 clang/lib/Driver/ToolChains/Arch/X86.cpp   | 8 +++++++-
 clang/lib/Driver/ToolChains/Arch/X86.h     | 2 +-
 clang/lib/Driver/ToolChains/CommonArgs.cpp | 2 +-
 clang/test/Driver/x86-mfpmath.c            | 5 +++++
 4 files changed, 14 insertions(+), 3 deletions(-)
 create mode 100644 clang/test/Driver/x86-mfpmath.c

diff --git a/clang/lib/Driver/ToolChains/Arch/X86.cpp b/clang/lib/Driver/ToolChains/Arch/X86.cpp
index cf2bc63d74ada..4383b80041435 100644
--- a/clang/lib/Driver/ToolChains/Arch/X86.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/X86.cpp
@@ -118,7 +118,13 @@ std::string x86::getX86TargetCPU(const Driver &D, const ArgList &Args,
 
 void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
                                const ArgList &Args,
-                               std::vector<StringRef> &Features) {
+                               std::vector<StringRef> &Features, bool ForAS) {
+  if (ForAS) {
+    // Some target-specific options are only handled in AddX86TargetArgs, which
+    // is not called by ClangAs::ConstructJob. Claim them here.
+    Args.claimAllArgs(options::OPT_mfpmath_EQ);
+  }
+
   // Claim and report unsupported -mabi=. Note: we don't support "sysv_abi" or
   // "ms_abi" as default function attributes.
   if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mabi_EQ)) {
diff --git a/clang/lib/Driver/ToolChains/Arch/X86.h b/clang/lib/Driver/ToolChains/Arch/X86.h
index e07387f3ece3d..762a1fa6f4d5f 100644
--- a/clang/lib/Driver/ToolChains/Arch/X86.h
+++ b/clang/lib/Driver/ToolChains/Arch/X86.h
@@ -26,7 +26,7 @@ std::string getX86TargetCPU(const Driver &D, const llvm::opt::ArgList &Args,
 
 void getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
                           const llvm::opt::ArgList &Args,
-                          std::vector<llvm::StringRef> &Features);
+                          std::vector<llvm::StringRef> &Features, bool ForAS);
 
 } // end namespace x86
 } // end namespace target
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 0d6907b8e5c7a..8766d34eec538 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -528,7 +528,7 @@ void tools::getTargetFeatures(const Driver &D, const llvm::Triple &Triple,
     break;
   case llvm::Triple::x86:
   case llvm::Triple::x86_64:
-    x86::getX86TargetFeatures(D, Triple, Args, Features);
+    x86::getX86TargetFeatures(D, Triple, Args, Features, ForAS);
     break;
   case llvm::Triple::hexagon:
     hexagon::getHexagonTargetFeatures(D, Triple, Args, Features);
diff --git a/clang/test/Driver/x86-mfpmath.c b/clang/test/Driver/x86-mfpmath.c
new file mode 100644
index 0000000000000..7df594477a92c
--- /dev/null
+++ b/clang/test/Driver/x86-mfpmath.c
@@ -0,0 +1,5 @@
+// RUN: %clang -### -c --target=x86_64 -mfpmath=sse %s 2>&1 | FileCheck %s
+// CHECK: "-mfpmath" "sse"
+
+/// Don't warn for assembler input.
+// RUN: %clang -### -Werror -c --target=x86_64 -mfpmath=sse -x assembler %s 2>&1 | FileCheck /dev/null --implicit-check-not='"-mfpmath"'

From da76a22ae9ff984a2038940b17e0322219bce25f Mon Sep 17 00:00:00 2001
From: Amy Huang <akhuang@google.com>
Date: Fri, 25 Aug 2023 15:12:56 -0700
Subject: [PATCH 39/92] [llvm-rc] Continue to use Argv[0] to resolve executable
 path

In internal google builds, MainExecPath doesn't go to the directory with `clang`.
Fall back to using Argv0 if MainExecPath doesn't find any clangs.

Differential Revision: https://reviews.llvm.org/D158901

(cherry picked from commit e4eb8d97e8afcb879dc5cd0da7a937dbb26fbf12)
---
 llvm/tools/llvm-rc/llvm-rc.cpp | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/llvm/tools/llvm-rc/llvm-rc.cpp b/llvm/tools/llvm-rc/llvm-rc.cpp
index 4a77f4bd88cce..233b888546a81 100644
--- a/llvm/tools/llvm-rc/llvm-rc.cpp
+++ b/llvm/tools/llvm-rc/llvm-rc.cpp
@@ -142,20 +142,24 @@ ErrorOr<std::string> findClang(const char *Argv0, StringRef Triple) {
   if (MainExecPath.empty())
     MainExecPath = Argv0;
 
-  StringRef Parent = llvm::sys::path::parent_path(MainExecPath);
   ErrorOr<std::string> Path = std::error_code();
   std::string TargetClang = (Triple + "-clang").str();
   std::string VersionedClang = ("clang-" + Twine(LLVM_VERSION_MAJOR)).str();
-  if (!Parent.empty()) {
-    // First look for the tool with all potential names in the specific
-    // directory of Argv0, if known
-    for (const auto *Name :
-         {TargetClang.c_str(), VersionedClang.c_str(), "clang", "clang-cl"}) {
+  for (const auto *Name :
+       {TargetClang.c_str(), VersionedClang.c_str(), "clang", "clang-cl"}) {
+    for (const StringRef Parent :
+         {llvm::sys::path::parent_path(MainExecPath),
+          llvm::sys::path::parent_path(Argv0)}) {
+      // Look for various versions of "clang" first in the MainExecPath parent
+      // directory and then in the argv[0] parent directory.
+      // On Windows (but not Unix) argv[0] is overwritten with the equivalent
+      // of MainExecPath by InitLLVM.
       Path = sys::findProgramByName(Name, Parent);
       if (Path)
         return Path;
     }
   }
+
   // If no parent directory known, or not found there, look everywhere in PATH
   for (const auto *Name : {"clang", "clang-cl"}) {
     Path = sys::findProgramByName(Name);

From 9fbdf9f8b1046a14323ddb95f7552c10f55b2c17 Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron@aaronballman.com>
Date: Tue, 29 Aug 2023 09:10:57 -0400
Subject: [PATCH 40/92] Fix up release note; NFC

---
 clang/docs/ReleaseNotes.rst | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index b15ff0d6d20a3..d755626f795d5 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -328,7 +328,9 @@ Attribute Changes in Clang
   the flag ``-Wunsafe-buffer-usage`` is enabled.
 - ``__declspec`` attributes can now be used together with the using keyword. Before
   the attributes on ``__declspec`` was ignored, while now it will be forwarded to the
-  point where the alias is used.
+  point where the alias is used. Note, some incorrect uses of ``__declspec`` on a
+  ``using`` declaration were being silently ignored and will now be appropriately
+  diagnosed as ignoring the attribute.
 - Introduced a new ``USR`` (unified symbol resolution) clause inside of the
   existing ``__attribute__((external_source_symbol))`` attribute. Clang's indexer
   uses the optional USR value when indexing Clang's AST. This value is expected

From a7346857934d6cecb21ddbd4100bbac1ce73cafc Mon Sep 17 00:00:00 2001
From: Khem Raj <raj.khem@gmail.com>
Date: Fri, 25 Aug 2023 10:43:00 -0700
Subject: [PATCH 41/92] [llvm-exegesis] Use mmap2 when mmap is unavailable to
 fix riscv32 build

Some 32-bit architectures don't have mmap and define mmap2 instead.
E.g. on riscv32 we may get

```
| /mnt/b/yoe/master/build/tmp/work-shared/llvm-project-source-17.0.0-r0/git/llvm/tools/llvm-exegesis/lib/X86/Target.cpp:1116:19: error: use of undeclared identifier 'SYS_mmap'
|  1116 |   generateSyscall(SYS_mmap, MmapCode);
|       |                   ^
| /mnt/b/yoe/master/build/tmp/work-shared/llvm-project-source-17.0.0-r0/git/llvm/tools/llvm-exegesis/lib/X86/Target.cpp:1134:19: error: use of undeclared identifier 'SYS_mmap'
|  1134 |   generateSyscall(SYS_mmap, GeneratedCode);
|       |                   ^
| 1 warning and 2 errors generated.
```

Co-Authored-By: Fangrui Song <i@maskray.me>
Differential Revision: https://reviews.llvm.org/D158375

(cherry picked from commit 01a92f06f23585f15b3e83b7c378d0df2d91e06b)
---
 llvm/tools/llvm-exegesis/lib/X86/Target.cpp           | 10 +++++-----
 llvm/unittests/tools/llvm-exegesis/X86/TargetTest.cpp | 10 +++++-----
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
index e690089fec565..b3d40800aef63 100644
--- a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
+++ b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
@@ -1083,11 +1083,11 @@ ExegesisX86Target::generateExitSyscall(unsigned ExitCode) const {
 #define MAP_FIXED_NOREPLACE MAP_FIXED
 #endif
 
-// 32 bit ARM doesn't have mmap and uses mmap2 instead. The only difference
-// between the two syscalls is that mmap2's offset parameter is in terms 4096
-// byte offsets rather than individual bytes, so for our purposes they are
-// effectively the same as all ofsets here are set to 0.
-#ifdef __arm__
+// Some 32-bit architectures don't have mmap and define mmap2 instead. The only
+// difference between the two syscalls is that mmap2's offset parameter is in
+// terms of 4096-byte offsets rather than individual bytes, so for our purposes
+// they are effectively the same as all offsets here are set to 0.
+#if defined(SYS_mmap2) && !defined(SYS_mmap)
 #define SYS_mmap SYS_mmap2
 #endif
 
diff --git a/llvm/unittests/tools/llvm-exegesis/X86/TargetTest.cpp b/llvm/unittests/tools/llvm-exegesis/X86/TargetTest.cpp
index aa5d525f24eb7..c001c693cc146 100644
--- a/llvm/unittests/tools/llvm-exegesis/X86/TargetTest.cpp
+++ b/llvm/unittests/tools/llvm-exegesis/X86/TargetTest.cpp
@@ -635,11 +635,11 @@ TEST_F(X86Core2TargetTest, GenerateExitSyscallTest) {
 #define MAP_FIXED_NOREPLACE MAP_FIXED
 #endif
 
-// 32 bit ARM doesn't have mmap and uses mmap2 instead. The only difference
-// between the two syscalls is that mmap2's offset parameter is in terms 4096
-// byte offsets rather than individual bytes, so for our purposes they are
-// effectively the same as all ofsets here are set to 0.
-#ifdef __arm__
+// Some 32-bit architectures don't have mmap and define mmap2 instead. The only
+// difference between the two syscalls is that mmap2's offset parameter is in
+// terms of 4096-byte offsets rather than individual bytes, so for our purposes
+// they are effectively the same as all offsets here are set to 0.
+#if defined(SYS_mmap2) && !defined(SYS_mmap)
 #define SYS_mmap SYS_mmap2
 #endif
 

From 5d2eda78bce64798fa0fb298f0726b2baa5c9e40 Mon Sep 17 00:00:00 2001
From: Erik Desjardins <erikdesjardinspublic@gmail.com>
Date: Thu, 24 Aug 2023 23:46:16 -0400
Subject: [PATCH 42/92] [ConstraintElim] fix crash with large constants in mul
 nsw

Another case of https://github.com/llvm/llvm-project/issues/55085.

The added test would trip an assertion due to calling `getSExtValue()` on a value that doesn't fit in int64_t.

Differential Revision: https://reviews.llvm.org/D158810

(cherry picked from commit 66ec5df3a7f33366455d50769e4e878544becea6)
---
 .../lib/Transforms/Scalar/ConstraintElimination.cpp |  2 +-
 .../ConstraintElimination/large-constant-ints.ll    | 13 +++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index 15628d32280d8..2b88dd08d88b6 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -424,7 +424,7 @@ static Decomposition decompose(Value *V,
       return MergeResults(Op0, Op1, IsSigned);
 
     ConstantInt *CI;
-    if (match(V, m_NSWMul(m_Value(Op0), m_ConstantInt(CI)))) {
+    if (match(V, m_NSWMul(m_Value(Op0), m_ConstantInt(CI))) && canUseSExt(CI)) {
       auto Result = decompose(Op0, Preconditions, IsSigned, DL);
       Result.mul(CI->getSExtValue());
       return Result;
diff --git a/llvm/test/Transforms/ConstraintElimination/large-constant-ints.ll b/llvm/test/Transforms/ConstraintElimination/large-constant-ints.ll
index fa8a4a60eac14..ae49ff9063fbe 100644
--- a/llvm/test/Transforms/ConstraintElimination/large-constant-ints.ll
+++ b/llvm/test/Transforms/ConstraintElimination/large-constant-ints.ll
@@ -282,6 +282,19 @@ else:
   ret i1 false
 }
 
+define i1 @mul_nsw_decomp(i128 %x) {
+    %val = mul nsw i128 %x, 9223372036854775808
+    %cmp = icmp sgt i128 %x, %val
+    br i1 %cmp, label %then, label %else
+
+then:
+    %cmp2 = icmp sgt i128 %x, 0
+    ret i1 %cmp2
+
+else:
+    ret i1 false
+}
+
 define i1 @add_nuw_decomp_recursive() {
 ; CHECK-LABEL: @add_nuw_decomp_recursive(
 ; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i64 -9223372036854775808, 10

From acc6a14413f3edb6661c6d77c2edbd9fa804e458 Mon Sep 17 00:00:00 2001
From: Erik Desjardins <erikdesjardinspublic@gmail.com>
Date: Fri, 25 Aug 2023 15:37:05 -0400
Subject: [PATCH 43/92] [Tests][ConstraintElim] autogen newly-added case in
 large-constant-ints.ll (NFC)

I forgot to do this in 66ec5df3a7f33366455d50769e4e878544becea6 / https://reviews.llvm.org/D158810.

Since this is testing for an assertion failure, the test checks don't matter, but we might as well avoid unnecessary churn the next time someone modifies this test.

(cherry picked from commit df112cba034eefb86d0e92e18518f5e944d58c37)
---
 .../large-constant-ints.ll                    | 22 ++++++++++++++-----
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/llvm/test/Transforms/ConstraintElimination/large-constant-ints.ll b/llvm/test/Transforms/ConstraintElimination/large-constant-ints.ll
index ae49ff9063fbe..f08068420406d 100644
--- a/llvm/test/Transforms/ConstraintElimination/large-constant-ints.ll
+++ b/llvm/test/Transforms/ConstraintElimination/large-constant-ints.ll
@@ -283,16 +283,26 @@ else:
 }
 
 define i1 @mul_nsw_decomp(i128 %x) {
-    %val = mul nsw i128 %x, 9223372036854775808
-    %cmp = icmp sgt i128 %x, %val
-    br i1 %cmp, label %then, label %else
+; CHECK-LABEL: @mul_nsw_decomp(
+; CHECK-NEXT:    [[VAL:%.*]] = mul nsw i128 [[X:%.*]], 9223372036854775808
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i128 [[X]], [[VAL]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i128 [[X]], 0
+; CHECK-NEXT:    ret i1 [[CMP2]]
+; CHECK:       else:
+; CHECK-NEXT:    ret i1 false
+;
+  %val = mul nsw i128 %x, 9223372036854775808
+  %cmp = icmp sgt i128 %x, %val
+  br i1 %cmp, label %then, label %else
 
 then:
-    %cmp2 = icmp sgt i128 %x, 0
-    ret i1 %cmp2
+  %cmp2 = icmp sgt i128 %x, 0
+  ret i1 %cmp2
 
 else:
-    ret i1 false
+  ret i1 false
 }
 
 define i1 @add_nuw_decomp_recursive() {

From 94af834cb87ff9e04f9b075540dbf9b75e4a4cb3 Mon Sep 17 00:00:00 2001
From: Phoebe Wang <phoebe.wang@intel.com>
Date: Mon, 28 Aug 2023 09:46:35 +0800
Subject: [PATCH 44/92] [X86][BF16] Lower FP_EXTEND for vector types under
 AVX512BF16

Fixes #64460

Reviewed By: RKSimon, skan

Differential Revision: https://reviews.llvm.org/D158950

(cherry picked from commit 6688701497ea8e562b769bdb154a40f4a1099abb)
---
 llvm/lib/Target/X86/X86ISelLowering.cpp |  79 ++--
 llvm/lib/Target/X86/X86ISelLowering.h   |   2 -
 llvm/test/CodeGen/X86/bfloat.ll         | 590 +++++++++++++++++++-----
 3 files changed, 510 insertions(+), 161 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c14d51bb4fa57..ae9012055bbb9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1648,7 +1648,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::FP_ROUND,           VT, Custom);
       setOperationAction(ISD::STRICT_FP_ROUND,    VT, Custom);
     }
-    for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32 }) {
+    for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
       setOperationAction(ISD::FP_EXTEND,          VT, Custom);
       setOperationAction(ISD::STRICT_FP_EXTEND,   VT, Custom);
     }
@@ -1656,9 +1656,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
       setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
     }
-
-    setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
-    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
   }
 
   // This block controls legalization of the mask vector sizes that are
@@ -1975,8 +1972,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setF16Action(MVT::v32f16, Expand);
     setOperationAction(ISD::FP_ROUND, MVT::v16f16, Custom);
     setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Custom);
-    setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal);
-    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
+    setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
+    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Custom);
     for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
       setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
       setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
@@ -2197,9 +2194,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::STRICT_UINT_TO_FP,      MVT::v32i16, Legal);
       setOperationAction(ISD::FP_ROUND,               MVT::v16f16, Legal);
       setOperationAction(ISD::STRICT_FP_ROUND,        MVT::v16f16, Legal);
-      setOperationAction(ISD::FP_EXTEND,              MVT::v16f32, Legal);
+      setOperationAction(ISD::FP_EXTEND,              MVT::v16f32, Custom);
       setOperationAction(ISD::STRICT_FP_EXTEND,       MVT::v16f32, Legal);
-      setOperationAction(ISD::FP_EXTEND,              MVT::v8f64,  Legal);
+      setOperationAction(ISD::FP_EXTEND,              MVT::v8f64,  Custom);
       setOperationAction(ISD::STRICT_FP_EXTEND,       MVT::v8f64,  Legal);
       setOperationAction(ISD::INSERT_VECTOR_ELT,      MVT::v32f16, Custom);
 
@@ -2249,9 +2246,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::STRICT_FP_TO_UINT,  MVT::v8i16, Custom);
       setOperationAction(ISD::FP_ROUND,           MVT::v8f16, Legal);
       setOperationAction(ISD::STRICT_FP_ROUND,    MVT::v8f16, Legal);
-      setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
+      setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Custom);
       setOperationAction(ISD::STRICT_FP_EXTEND,   MVT::v8f32, Legal);
-      setOperationAction(ISD::FP_EXTEND,          MVT::v4f64, Legal);
+      setOperationAction(ISD::FP_EXTEND,          MVT::v4f64, Custom);
       setOperationAction(ISD::STRICT_FP_EXTEND,   MVT::v4f64, Legal);
 
       // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
@@ -14795,13 +14792,9 @@ static bool isShuffleFoldableLoad(SDValue V) {
 }
 
 template<typename T>
-static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) {
-  return VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16();
-}
-
-template<typename T>
-bool X86TargetLowering::isSoftFP16(T VT) const {
-  return ::isSoftFP16(VT, Subtarget);
+static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
+  T EltVT = VT.getScalarType();
+  return EltVT == MVT::bf16 || (EltVT == MVT::f16 && !Subtarget.hasFP16());
 }
 
 /// Try to lower insertion of a single element into a zero vector.
@@ -14817,7 +14810,7 @@ static SDValue lowerShuffleAsElementInsertion(
   unsigned NumElts = VT.getVectorNumElements();
   unsigned EltBits = VT.getScalarSizeInBits();
 
-  if (isSoftFP16(EltVT, Subtarget))
+  if (isSoftF16(EltVT, Subtarget))
     return SDValue();
 
   int V2Index =
@@ -20374,7 +20367,7 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
 
   SDLoc dl(Op);
   MVT VT = Op.getSimpleValueType();
-  if (isSoftFP16(VT)) {
+  if (isSoftF16(VT, Subtarget)) {
     MVT NVT = VT.changeVectorElementTypeToInteger();
     return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
                                           DAG.getBitcast(NVT, LHS),
@@ -21852,7 +21845,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
   MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
 
-  if (isSoftFP16(VT))
+  if (isSoftF16(VT, Subtarget))
     return promoteXINT_TO_FP(Op, DAG);
   else if (isLegalConversion(SrcVT, true, Subtarget))
     return Op;
@@ -22357,7 +22350,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
   if (DstVT == MVT::f128)
     return SDValue();
 
-  if (isSoftFP16(DstVT))
+  if (isSoftF16(DstVT, Subtarget))
     return promoteXINT_TO_FP(Op, DAG);
   else if (isLegalConversion(SrcVT, false, Subtarget))
     return Op;
@@ -23314,7 +23307,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
   SDLoc dl(Op);
 
   SDValue Res;
-  if (isSoftFP16(SrcVT)) {
+  if (isSoftF16(SrcVT, Subtarget)) {
     MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
     if (IsStrict)
       return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
@@ -23743,7 +23736,7 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
 
   // This code is only for floats and doubles. Fall back to generic code for
   // anything else.
-  if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftFP16(SrcVT))
+  if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftF16(SrcVT, Subtarget))
     return SDValue();
 
   EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
@@ -23888,6 +23881,10 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
                           !Subtarget.getTargetTriple().isOSDarwin()))
     return SDValue();
 
+  if ((SVT == MVT::v8f16 && Subtarget.hasF16C()) ||
+      (SVT == MVT::v16f16 && Subtarget.useAVX512Regs()))
+    return Op;
+
   if (SVT == MVT::f16) {
     if (Subtarget.hasFP16())
       return Op;
@@ -23960,7 +23957,25 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
   if (!SVT.isVector())
     return Op;
 
+  if (SVT.getVectorElementType() == MVT::bf16) {
+    // FIXME: Do we need to support strict FP?
+    assert(!IsStrict && "Strict FP doesn't support BF16");
+    if (VT.getVectorElementType() == MVT::f64) {
+      MVT TmpVT = VT.changeVectorElementType(MVT::f32);
+      return DAG.getNode(ISD::FP_EXTEND, DL, VT,
+                         DAG.getNode(ISD::FP_EXTEND, DL, TmpVT, In));
+    }
+    assert(VT.getVectorElementType() == MVT::f32 && "Unexpected fpext");
+    MVT NVT = SVT.changeVectorElementType(MVT::i32);
+    In = DAG.getBitcast(SVT.changeTypeToInteger(), In);
+    In = DAG.getNode(ISD::ZERO_EXTEND, DL, NVT, In);
+    In = DAG.getNode(ISD::SHL, DL, NVT, In, DAG.getConstant(16, DL, NVT));
+    return DAG.getBitcast(VT, In);
+  }
+
   if (SVT.getVectorElementType() == MVT::f16) {
+    if (Subtarget.hasFP16() && isTypeLegal(SVT))
+      return Op;
     assert(Subtarget.hasF16C() && "Unexpected features!");
     if (SVT == MVT::v2f16)
       In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
@@ -25676,7 +25691,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
   if (isFP) {
     MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
     assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64);
-    if (isSoftFP16(EltVT, Subtarget))
+    if (isSoftF16(EltVT, Subtarget))
       return SDValue();
 
     bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
@@ -26241,7 +26256,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
   ISD::CondCode CC =
       cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
 
-  if (isSoftFP16(Op0.getValueType()))
+  if (isSoftF16(Op0.getValueType(), Subtarget))
     return SDValue();
 
   // Handle f128 first, since one possible outcome is a normal integer
@@ -26434,7 +26449,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   MVT VT = Op1.getSimpleValueType();
   SDValue CC;
 
-  if (isSoftFP16(VT)) {
+  if (isSoftF16(VT, Subtarget)) {
     MVT NVT = VT.changeTypeToInteger();
     return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
                                           DAG.getBitcast(NVT, Op1),
@@ -26506,7 +26521,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   }
 
   if (Cond.getOpcode() == ISD::SETCC &&
-      !isSoftFP16(Cond.getOperand(0).getSimpleValueType())) {
+      !isSoftF16(Cond.getOperand(0).getSimpleValueType(), Subtarget)) {
     if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
       Cond = NewCond;
       // If the condition was updated, it's possible that the operands of the
@@ -27196,7 +27211,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
   // Bail out when we don't have native compare instructions.
   if (Cond.getOpcode() == ISD::SETCC &&
       Cond.getOperand(0).getValueType() != MVT::f128 &&
-      !isSoftFP16(Cond.getOperand(0).getValueType())) {
+      !isSoftF16(Cond.getOperand(0).getValueType(), Subtarget)) {
     SDValue LHS = Cond.getOperand(0);
     SDValue RHS = Cond.getOperand(1);
     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
@@ -34983,7 +34998,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     EVT SrcVT = Src.getValueType();
 
     SDValue Res;
-    if (isSoftFP16(SrcVT)) {
+    if (isSoftF16(SrcVT, Subtarget)) {
       EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
       if (IsStrict) {
         Res =
@@ -47383,7 +47398,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
   // ignored in unsafe-math mode).
   // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
-      VT != MVT::f80 && VT != MVT::f128 && !isSoftFP16(VT, Subtarget) &&
+      VT != MVT::f80 && VT != MVT::f128 && !isSoftF16(VT, Subtarget) &&
       (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
       (Subtarget.hasSSE2() ||
        (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
@@ -47700,7 +47715,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
   }
 
   // Early exit check
-  if (!TLI.isTypeLegal(VT) || isSoftFP16(VT, Subtarget))
+  if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget))
     return SDValue();
 
   if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
@@ -54550,7 +54565,7 @@ static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
   EVT VT = N->getValueType(0);
-  if (Subtarget.useSoftFloat() || isSoftFP16(VT, Subtarget))
+  if (Subtarget.useSoftFloat() || isSoftF16(VT, Subtarget))
     return SDValue();
 
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 250df82a30c2f..047d8f0210470 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1749,8 +1749,6 @@ namespace llvm {
 
     bool needsCmpXchgNb(Type *MemType) const;
 
-    template<typename T> bool isSoftFP16(T VT) const;
-
     void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB,
                                 MachineBasicBlock *DispatchBB, int FI) const;
 
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
index f680a39a482ec..dff4864537bfd 100644
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16 | FileCheck %s --check-prefixes=CHECK,BF16
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512vl | FileCheck %s --check-prefixes=CHECK,F16,BF16
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,F16,FP16
 
 define void @add(ptr %pa, ptr %pb, ptr %pc) nounwind {
 ; SSE2-LABEL: add:
@@ -20,22 +21,22 @@ define void @add(ptr %pa, ptr %pb, ptr %pc) nounwind {
 ; SSE2-NEXT:    popq %rbx
 ; SSE2-NEXT:    retq
 ;
-; BF16-LABEL: add:
-; BF16:       # %bb.0:
-; BF16-NEXT:    pushq %rbx
-; BF16-NEXT:    movq %rdx, %rbx
-; BF16-NEXT:    movzwl (%rsi), %eax
-; BF16-NEXT:    shll $16, %eax
-; BF16-NEXT:    vmovd %eax, %xmm0
-; BF16-NEXT:    movzwl (%rdi), %eax
-; BF16-NEXT:    shll $16, %eax
-; BF16-NEXT:    vmovd %eax, %xmm1
-; BF16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; BF16-NEXT:    callq __truncsfbf2@PLT
-; BF16-NEXT:    vmovd %xmm0, %eax
-; BF16-NEXT:    movw %ax, (%rbx)
-; BF16-NEXT:    popq %rbx
-; BF16-NEXT:    retq
+; F16-LABEL: add:
+; F16:       # %bb.0:
+; F16-NEXT:    pushq %rbx
+; F16-NEXT:    movq %rdx, %rbx
+; F16-NEXT:    movzwl (%rsi), %eax
+; F16-NEXT:    shll $16, %eax
+; F16-NEXT:    vmovd %eax, %xmm0
+; F16-NEXT:    movzwl (%rdi), %eax
+; F16-NEXT:    shll $16, %eax
+; F16-NEXT:    vmovd %eax, %xmm1
+; F16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; F16-NEXT:    callq __truncsfbf2@PLT
+; F16-NEXT:    vmovd %xmm0, %eax
+; F16-NEXT:    movw %ax, (%rbx)
+; F16-NEXT:    popq %rbx
+; F16-NEXT:    retq
   %a = load bfloat, ptr %pa
   %b = load bfloat, ptr %pb
   %add = fadd bfloat %a, %b
@@ -58,19 +59,19 @@ define bfloat @add2(bfloat %a, bfloat %b) nounwind {
 ; SSE2-NEXT:    popq %rax
 ; SSE2-NEXT:    retq
 ;
-; BF16-LABEL: add2:
-; BF16:       # %bb.0:
-; BF16-NEXT:    pushq %rax
-; BF16-NEXT:    vmovd %xmm0, %eax
-; BF16-NEXT:    vmovd %xmm1, %ecx
-; BF16-NEXT:    shll $16, %ecx
-; BF16-NEXT:    vmovd %ecx, %xmm0
-; BF16-NEXT:    shll $16, %eax
-; BF16-NEXT:    vmovd %eax, %xmm1
-; BF16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; BF16-NEXT:    callq __truncsfbf2@PLT
-; BF16-NEXT:    popq %rax
-; BF16-NEXT:    retq
+; F16-LABEL: add2:
+; F16:       # %bb.0:
+; F16-NEXT:    pushq %rax
+; F16-NEXT:    vmovd %xmm0, %eax
+; F16-NEXT:    vmovd %xmm1, %ecx
+; F16-NEXT:    shll $16, %ecx
+; F16-NEXT:    vmovd %ecx, %xmm0
+; F16-NEXT:    shll $16, %eax
+; F16-NEXT:    vmovd %eax, %xmm1
+; F16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; F16-NEXT:    callq __truncsfbf2@PLT
+; F16-NEXT:    popq %rax
+; F16-NEXT:    retq
   %add = fadd bfloat %a, %b
   ret bfloat %add
 }
@@ -105,34 +106,34 @@ define void @add_double(ptr %pa, ptr %pb, ptr %pc) nounwind {
 ; SSE2-NEXT:    popq %rbp
 ; SSE2-NEXT:    retq
 ;
-; BF16-LABEL: add_double:
-; BF16:       # %bb.0:
-; BF16-NEXT:    pushq %rbp
-; BF16-NEXT:    pushq %r14
-; BF16-NEXT:    pushq %rbx
-; BF16-NEXT:    movq %rdx, %rbx
-; BF16-NEXT:    movq %rsi, %r14
-; BF16-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; BF16-NEXT:    callq __truncdfbf2@PLT
-; BF16-NEXT:    vmovd %xmm0, %ebp
-; BF16-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; BF16-NEXT:    callq __truncdfbf2@PLT
-; BF16-NEXT:    vmovd %xmm0, %eax
-; BF16-NEXT:    shll $16, %eax
-; BF16-NEXT:    vmovd %eax, %xmm0
-; BF16-NEXT:    shll $16, %ebp
-; BF16-NEXT:    vmovd %ebp, %xmm1
-; BF16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; BF16-NEXT:    callq __truncsfbf2@PLT
-; BF16-NEXT:    vmovd %xmm0, %eax
-; BF16-NEXT:    shll $16, %eax
-; BF16-NEXT:    vmovd %eax, %xmm0
-; BF16-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
-; BF16-NEXT:    vmovsd %xmm0, (%rbx)
-; BF16-NEXT:    popq %rbx
-; BF16-NEXT:    popq %r14
-; BF16-NEXT:    popq %rbp
-; BF16-NEXT:    retq
+; F16-LABEL: add_double:
+; F16:       # %bb.0:
+; F16-NEXT:    pushq %rbp
+; F16-NEXT:    pushq %r14
+; F16-NEXT:    pushq %rbx
+; F16-NEXT:    movq %rdx, %rbx
+; F16-NEXT:    movq %rsi, %r14
+; F16-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; F16-NEXT:    callq __truncdfbf2@PLT
+; F16-NEXT:    vmovd %xmm0, %ebp
+; F16-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; F16-NEXT:    callq __truncdfbf2@PLT
+; F16-NEXT:    vmovd %xmm0, %eax
+; F16-NEXT:    shll $16, %eax
+; F16-NEXT:    vmovd %eax, %xmm0
+; F16-NEXT:    shll $16, %ebp
+; F16-NEXT:    vmovd %ebp, %xmm1
+; F16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; F16-NEXT:    callq __truncsfbf2@PLT
+; F16-NEXT:    vmovd %xmm0, %eax
+; F16-NEXT:    shll $16, %eax
+; F16-NEXT:    vmovd %eax, %xmm0
+; F16-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
+; F16-NEXT:    vmovsd %xmm0, (%rbx)
+; F16-NEXT:    popq %rbx
+; F16-NEXT:    popq %r14
+; F16-NEXT:    popq %rbp
+; F16-NEXT:    retq
   %la = load double, ptr %pa
   %a = fptrunc double %la to bfloat
   %lb = load double, ptr %pb
@@ -169,30 +170,30 @@ define double @add_double2(double %da, double %db) nounwind {
 ; SSE2-NEXT:    popq %rbx
 ; SSE2-NEXT:    retq
 ;
-; BF16-LABEL: add_double2:
-; BF16:       # %bb.0:
-; BF16-NEXT:    pushq %rbx
-; BF16-NEXT:    subq $16, %rsp
-; BF16-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; BF16-NEXT:    callq __truncdfbf2@PLT
-; BF16-NEXT:    vmovd %xmm0, %ebx
-; BF16-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
-; BF16-NEXT:    # xmm0 = mem[0],zero
-; BF16-NEXT:    callq __truncdfbf2@PLT
-; BF16-NEXT:    vmovd %xmm0, %eax
-; BF16-NEXT:    shll $16, %eax
-; BF16-NEXT:    vmovd %eax, %xmm0
-; BF16-NEXT:    shll $16, %ebx
-; BF16-NEXT:    vmovd %ebx, %xmm1
-; BF16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; BF16-NEXT:    callq __truncsfbf2@PLT
-; BF16-NEXT:    vmovd %xmm0, %eax
-; BF16-NEXT:    shll $16, %eax
-; BF16-NEXT:    vmovd %eax, %xmm0
-; BF16-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
-; BF16-NEXT:    addq $16, %rsp
-; BF16-NEXT:    popq %rbx
-; BF16-NEXT:    retq
+; F16-LABEL: add_double2:
+; F16:       # %bb.0:
+; F16-NEXT:    pushq %rbx
+; F16-NEXT:    subq $16, %rsp
+; F16-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; F16-NEXT:    callq __truncdfbf2@PLT
+; F16-NEXT:    vmovd %xmm0, %ebx
+; F16-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
+; F16-NEXT:    # xmm0 = mem[0],zero
+; F16-NEXT:    callq __truncdfbf2@PLT
+; F16-NEXT:    vmovd %xmm0, %eax
+; F16-NEXT:    shll $16, %eax
+; F16-NEXT:    vmovd %eax, %xmm0
+; F16-NEXT:    shll $16, %ebx
+; F16-NEXT:    vmovd %ebx, %xmm1
+; F16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; F16-NEXT:    callq __truncsfbf2@PLT
+; F16-NEXT:    vmovd %xmm0, %eax
+; F16-NEXT:    shll $16, %eax
+; F16-NEXT:    vmovd %eax, %xmm0
+; F16-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
+; F16-NEXT:    addq $16, %rsp
+; F16-NEXT:    popq %rbx
+; F16-NEXT:    retq
   %a = fptrunc double %da to bfloat
   %b = fptrunc double %db to bfloat
   %add = fadd bfloat %a, %b
@@ -215,19 +216,19 @@ define void @add_constant(ptr %pa, ptr %pc) nounwind {
 ; SSE2-NEXT:    popq %rbx
 ; SSE2-NEXT:    retq
 ;
-; BF16-LABEL: add_constant:
-; BF16:       # %bb.0:
-; BF16-NEXT:    pushq %rbx
-; BF16-NEXT:    movq %rsi, %rbx
-; BF16-NEXT:    movzwl (%rdi), %eax
-; BF16-NEXT:    shll $16, %eax
-; BF16-NEXT:    vmovd %eax, %xmm0
-; BF16-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; BF16-NEXT:    callq __truncsfbf2@PLT
-; BF16-NEXT:    vmovd %xmm0, %eax
-; BF16-NEXT:    movw %ax, (%rbx)
-; BF16-NEXT:    popq %rbx
-; BF16-NEXT:    retq
+; F16-LABEL: add_constant:
+; F16:       # %bb.0:
+; F16-NEXT:    pushq %rbx
+; F16-NEXT:    movq %rsi, %rbx
+; F16-NEXT:    movzwl (%rdi), %eax
+; F16-NEXT:    shll $16, %eax
+; F16-NEXT:    vmovd %eax, %xmm0
+; F16-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; F16-NEXT:    callq __truncsfbf2@PLT
+; F16-NEXT:    vmovd %xmm0, %eax
+; F16-NEXT:    movw %ax, (%rbx)
+; F16-NEXT:    popq %rbx
+; F16-NEXT:    retq
   %a = load bfloat, ptr %pa
   %add = fadd bfloat %a, 1.0
   store bfloat %add, ptr %pc
@@ -246,16 +247,16 @@ define bfloat @add_constant2(bfloat %a) nounwind {
 ; SSE2-NEXT:    popq %rax
 ; SSE2-NEXT:    retq
 ;
-; BF16-LABEL: add_constant2:
-; BF16:       # %bb.0:
-; BF16-NEXT:    pushq %rax
-; BF16-NEXT:    vmovd %xmm0, %eax
-; BF16-NEXT:    shll $16, %eax
-; BF16-NEXT:    vmovd %eax, %xmm0
-; BF16-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; BF16-NEXT:    callq __truncsfbf2@PLT
-; BF16-NEXT:    popq %rax
-; BF16-NEXT:    retq
+; F16-LABEL: add_constant2:
+; F16:       # %bb.0:
+; F16-NEXT:    pushq %rax
+; F16-NEXT:    vmovd %xmm0, %eax
+; F16-NEXT:    shll $16, %eax
+; F16-NEXT:    vmovd %eax, %xmm0
+; F16-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; F16-NEXT:    callq __truncsfbf2@PLT
+; F16-NEXT:    popq %rax
+; F16-NEXT:    retq
   %add = fadd bfloat %a, 1.0
   ret bfloat %add
 }
@@ -540,6 +541,121 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
 ; BF16-NEXT:    popq %r15
 ; BF16-NEXT:    popq %rbp
 ; BF16-NEXT:    retq
+;
+; FP16-LABEL: addv:
+; FP16:       # %bb.0:
+; FP16-NEXT:    pushq %rbp
+; FP16-NEXT:    pushq %r15
+; FP16-NEXT:    pushq %r14
+; FP16-NEXT:    pushq %r13
+; FP16-NEXT:    pushq %r12
+; FP16-NEXT:    pushq %rbx
+; FP16-NEXT:    subq $40, %rsp
+; FP16-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; FP16-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; FP16-NEXT:    vmovw %xmm1, %eax
+; FP16-NEXT:    shll $16, %eax
+; FP16-NEXT:    vmovd %eax, %xmm2
+; FP16-NEXT:    vmovw %xmm0, %eax
+; FP16-NEXT:    shll $16, %eax
+; FP16-NEXT:    vmovd %eax, %xmm1
+; FP16-NEXT:    vaddss %xmm2, %xmm1, %xmm0
+; FP16-NEXT:    callq __truncsfbf2@PLT
+; FP16-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; FP16-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; FP16-NEXT:    vpextrw $7, %xmm0, %eax
+; FP16-NEXT:    shll $16, %eax
+; FP16-NEXT:    vmovd %eax, %xmm0
+; FP16-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; FP16-NEXT:    vpextrw $7, %xmm1, %eax
+; FP16-NEXT:    shll $16, %eax
+; FP16-NEXT:    vmovd %eax, %xmm1
+; FP16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; FP16-NEXT:    callq __truncsfbf2@PLT
+; FP16-NEXT:    vmovd %xmm0, %ebp
+; FP16-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; FP16-NEXT:    vpextrw $6, %xmm0, %eax
+; FP16-NEXT:    shll $16, %eax
+; FP16-NEXT:    vmovd %eax, %xmm0
+; FP16-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; FP16-NEXT:    vpextrw $6, %xmm1, %eax
+; FP16-NEXT:    shll $16, %eax
+; FP16-NEXT:    vmovd %eax, %xmm1
+; FP16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; FP16-NEXT:    callq __truncsfbf2@PLT
+; FP16-NEXT:    vmovd %xmm0, %r14d
+; FP16-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; FP16-NEXT:    vpextrw $5, %xmm0, %eax
+; FP16-NEXT:    shll $16, %eax
+; FP16-NEXT:    vmovd %eax, %xmm0
+; FP16-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; FP16-NEXT:    vpextrw $5, %xmm1, %eax
+; FP16-NEXT:    shll $16, %eax
+; FP16-NEXT:    vmovd %eax, %xmm1
+; FP16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; FP16-NEXT:    callq __truncsfbf2@PLT
+; FP16-NEXT:    vmovd %xmm0, %r15d
+; FP16-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; FP16-NEXT:    vpextrw $4, %xmm0, %eax
+; FP16-NEXT:    shll $16, %eax
+; FP16-NEXT:    vmovd %eax, %xmm0
+; FP16-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; FP16-NEXT:    vpextrw $4, %xmm1, %eax
+; FP16-NEXT:    shll $16, %eax
+; FP16-NEXT:    vmovd %eax, %xmm1
+; FP16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; FP16-NEXT:    callq __truncsfbf2@PLT
+; FP16-NEXT:    vmovd %xmm0, %r12d
+; FP16-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; FP16-NEXT:    vpextrw $3, %xmm0, %eax
+; FP16-NEXT:    shll $16, %eax
+; FP16-NEXT:    vmovd %eax, %xmm0
+; FP16-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; FP16-NEXT:    vpextrw $3, %xmm1, %eax
+; FP16-NEXT:    shll $16, %eax
+; FP16-NEXT:    vmovd %eax, %xmm1
+; FP16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; FP16-NEXT:    callq __truncsfbf2@PLT
+; FP16-NEXT:    vmovd %xmm0, %r13d
+; FP16-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; FP16-NEXT:    vpextrw $2, %xmm0, %eax
+; FP16-NEXT:    shll $16, %eax
+; FP16-NEXT:    vmovd %eax, %xmm0
+; FP16-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; FP16-NEXT:    vpextrw $2, %xmm1, %eax
+; FP16-NEXT:    shll $16, %eax
+; FP16-NEXT:    vmovd %eax, %xmm1
+; FP16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; FP16-NEXT:    callq __truncsfbf2@PLT
+; FP16-NEXT:    vmovd %xmm0, %ebx
+; FP16-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; FP16-NEXT:    vpextrw $1, %xmm0, %eax
+; FP16-NEXT:    shll $16, %eax
+; FP16-NEXT:    vmovd %eax, %xmm0
+; FP16-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; FP16-NEXT:    vpextrw $1, %xmm1, %eax
+; FP16-NEXT:    shll $16, %eax
+; FP16-NEXT:    vmovd %eax, %xmm1
+; FP16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; FP16-NEXT:    callq __truncsfbf2@PLT
+; FP16-NEXT:    vmovd %xmm0, %eax
+; FP16-NEXT:    vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; FP16-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; FP16-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
+; FP16-NEXT:    vpinsrw $2, %ebx, %xmm0, %xmm0
+; FP16-NEXT:    vpinsrw $3, %r13d, %xmm0, %xmm0
+; FP16-NEXT:    vpinsrw $4, %r12d, %xmm0, %xmm0
+; FP16-NEXT:    vpinsrw $5, %r15d, %xmm0, %xmm0
+; FP16-NEXT:    vpinsrw $6, %r14d, %xmm0, %xmm0
+; FP16-NEXT:    vpinsrw $7, %ebp, %xmm0, %xmm0
+; FP16-NEXT:    addq $40, %rsp
+; FP16-NEXT:    popq %rbx
+; FP16-NEXT:    popq %r12
+; FP16-NEXT:    popq %r13
+; FP16-NEXT:    popq %r14
+; FP16-NEXT:    popq %r15
+; FP16-NEXT:    popq %rbp
+; FP16-NEXT:    retq
   %add = fadd <8 x bfloat> %a, %b
   ret <8 x bfloat> %add
 }
@@ -554,13 +670,13 @@ define <2 x bfloat> @pr62997(bfloat %a, bfloat %b) {
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; SSE2-NEXT:    retq
 ;
-; BF16-LABEL: pr62997:
-; BF16:       # %bb.0:
-; BF16-NEXT:    vmovd %xmm1, %eax
-; BF16-NEXT:    vmovd %xmm0, %ecx
-; BF16-NEXT:    vmovd %ecx, %xmm0
-; BF16-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
-; BF16-NEXT:    retq
+; F16-LABEL: pr62997:
+; F16:       # %bb.0:
+; F16-NEXT:    vmovd %xmm1, %eax
+; F16-NEXT:    vmovd %xmm0, %ecx
+; F16-NEXT:    vmovd %ecx, %xmm0
+; F16-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
+; F16-NEXT:    retq
   %1 = insertelement <2 x bfloat> undef, bfloat %a, i64 0
   %2 = insertelement <2 x bfloat> %1, bfloat %b, i64 1
   ret <2 x bfloat> %2
@@ -575,10 +691,10 @@ define <32 x bfloat> @pr63017() {
 ; SSE2-NEXT:    xorps %xmm3, %xmm3
 ; SSE2-NEXT:    retq
 ;
-; BF16-LABEL: pr63017:
-; BF16:       # %bb.0:
-; BF16-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; BF16-NEXT:    retq
+; F16-LABEL: pr63017:
+; F16:       # %bb.0:
+; F16-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; F16-NEXT:    retq
   ret <32 x bfloat> zeroinitializer
 }
 
@@ -1149,11 +1265,11 @@ define <32 x bfloat> @pr63017_2() nounwind {
 ; SSE2-NEXT:    popq %r14
 ; SSE2-NEXT:    retq
 ;
-; BF16-LABEL: pr63017_2:
-; BF16:       # %bb.0:
-; BF16-NEXT:    vpbroadcastw {{.*#+}} zmm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
-; BF16-NEXT:    vmovdqu16 (%rax), %zmm0 {%k1}
-; BF16-NEXT:    retq
+; F16-LABEL: pr63017_2:
+; F16:       # %bb.0:
+; F16-NEXT:    vpbroadcastw {{.*#+}} zmm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
+; F16-NEXT:    vmovdqu16 (%rax), %zmm0 {%k1}
+; F16-NEXT:    retq
   %1 = call <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr poison, i32 2, <32 x i1> poison, <32 x bfloat> <bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80>)
   ret <32 x bfloat> %1
 }
@@ -1173,14 +1289,234 @@ define <32 x bfloat> @pr62997_3(<32 x bfloat> %0, bfloat %1) {
 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
 ; SSE2-NEXT:    retq
 ;
-; BF16-LABEL: pr62997_3:
-; BF16:       # %bb.0:
-; BF16-NEXT:    vmovd %xmm1, %eax
-; BF16-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm1
-; BF16-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
-; BF16-NEXT:    retq
+; F16-LABEL: pr62997_3:
+; F16:       # %bb.0:
+; F16-NEXT:    vmovd %xmm1, %eax
+; F16-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm1
+; F16-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; F16-NEXT:    retq
   %3 = insertelement <32 x bfloat> %0, bfloat %1, i64 1
   ret <32 x bfloat> %3
 }
 
 declare <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr, i32, <32 x i1>, <32 x bfloat>)
+
+define <4 x float> @pr64460_1(<4 x bfloat> %a) {
+; SSE2-LABEL: pr64460_1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pextrw $1, %xmm0, %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    pextrw $3, %xmm0, %eax
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; F16-LABEL: pr64460_1:
+; F16:       # %bb.0:
+; F16-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; F16-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; F16-NEXT:    retq
+  %b = fpext <4 x bfloat> %a to <4 x float>
+  ret <4 x float> %b
+}
+
+define <8 x float> @pr64460_2(<8 x bfloat> %a) {
+; SSE2-LABEL: pr64460_2:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %xmm0, %rdx
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    movq %xmm0, %rcx
+; SSE2-NEXT:    movq %rcx, %rax
+; SSE2-NEXT:    shrq $32, %rax
+; SSE2-NEXT:    movq %rdx, %rsi
+; SSE2-NEXT:    shrq $32, %rsi
+; SSE2-NEXT:    movl %edx, %edi
+; SSE2-NEXT:    andl $-65536, %edi # imm = 0xFFFF0000
+; SSE2-NEXT:    movd %edi, %xmm1
+; SSE2-NEXT:    movl %edx, %edi
+; SSE2-NEXT:    shll $16, %edi
+; SSE2-NEXT:    movd %edi, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    shrq $48, %rdx
+; SSE2-NEXT:    shll $16, %edx
+; SSE2-NEXT:    movd %edx, %xmm1
+; SSE2-NEXT:    shll $16, %esi
+; SSE2-NEXT:    movd %esi, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT:    movl %ecx, %edx
+; SSE2-NEXT:    andl $-65536, %edx # imm = 0xFFFF0000
+; SSE2-NEXT:    movd %edx, %xmm2
+; SSE2-NEXT:    movl %ecx, %edx
+; SSE2-NEXT:    shll $16, %edx
+; SSE2-NEXT:    movd %edx, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    shrq $48, %rcx
+; SSE2-NEXT:    shll $16, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm2
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE2-NEXT:    retq
+;
+; F16-LABEL: pr64460_2:
+; F16:       # %bb.0:
+; F16-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; F16-NEXT:    vpslld $16, %ymm0, %ymm0
+; F16-NEXT:    retq
+  %b = fpext <8 x bfloat> %a to <8 x float>
+  ret <8 x float> %b
+}
+
+define <16 x float> @pr64460_3(<16 x bfloat> %a) {
+; SSE2-LABEL: pr64460_3:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %xmm1, %rdi
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1,1]
+; SSE2-NEXT:    movq %xmm1, %rcx
+; SSE2-NEXT:    movq %rcx, %rax
+; SSE2-NEXT:    shrq $32, %rax
+; SSE2-NEXT:    movq %xmm0, %r9
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    movq %xmm0, %rsi
+; SSE2-NEXT:    movq %rsi, %rdx
+; SSE2-NEXT:    shrq $32, %rdx
+; SSE2-NEXT:    movq %rdi, %r8
+; SSE2-NEXT:    shrq $32, %r8
+; SSE2-NEXT:    movq %r9, %r10
+; SSE2-NEXT:    shrq $32, %r10
+; SSE2-NEXT:    movl %r9d, %r11d
+; SSE2-NEXT:    andl $-65536, %r11d # imm = 0xFFFF0000
+; SSE2-NEXT:    movd %r11d, %xmm1
+; SSE2-NEXT:    movl %r9d, %r11d
+; SSE2-NEXT:    shll $16, %r11d
+; SSE2-NEXT:    movd %r11d, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    shrq $48, %r9
+; SSE2-NEXT:    shll $16, %r9d
+; SSE2-NEXT:    movd %r9d, %xmm1
+; SSE2-NEXT:    shll $16, %r10d
+; SSE2-NEXT:    movd %r10d, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT:    movl %edi, %r9d
+; SSE2-NEXT:    andl $-65536, %r9d # imm = 0xFFFF0000
+; SSE2-NEXT:    movd %r9d, %xmm1
+; SSE2-NEXT:    movl %edi, %r9d
+; SSE2-NEXT:    shll $16, %r9d
+; SSE2-NEXT:    movd %r9d, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT:    shrq $48, %rdi
+; SSE2-NEXT:    shll $16, %edi
+; SSE2-NEXT:    movd %edi, %xmm1
+; SSE2-NEXT:    shll $16, %r8d
+; SSE2-NEXT:    movd %r8d, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT:    movl %esi, %edi
+; SSE2-NEXT:    andl $-65536, %edi # imm = 0xFFFF0000
+; SSE2-NEXT:    movd %edi, %xmm3
+; SSE2-NEXT:    movl %esi, %edi
+; SSE2-NEXT:    shll $16, %edi
+; SSE2-NEXT:    movd %edi, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT:    shrq $48, %rsi
+; SSE2-NEXT:    shll $16, %esi
+; SSE2-NEXT:    movd %esi, %xmm3
+; SSE2-NEXT:    shll $16, %edx
+; SSE2-NEXT:    movd %edx, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
+; SSE2-NEXT:    movl %ecx, %edx
+; SSE2-NEXT:    andl $-65536, %edx # imm = 0xFFFF0000
+; SSE2-NEXT:    movd %edx, %xmm4
+; SSE2-NEXT:    movl %ecx, %edx
+; SSE2-NEXT:    shll $16, %edx
+; SSE2-NEXT:    movd %edx, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE2-NEXT:    shrq $48, %rcx
+; SSE2-NEXT:    shll $16, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm4
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm5
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
+; SSE2-NEXT:    retq
+;
+; F16-LABEL: pr64460_3:
+; F16:       # %bb.0:
+; F16-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; F16-NEXT:    vpslld $16, %zmm0, %zmm0
+; F16-NEXT:    retq
+  %b = fpext <16 x bfloat> %a to <16 x float>
+  ret <16 x float> %b
+}
+
+define <8 x double> @pr64460_4(<8 x bfloat> %a) {
+; SSE2-LABEL: pr64460_4:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %xmm0, %rsi
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    movq %xmm0, %rdx
+; SSE2-NEXT:    movq %rdx, %rax
+; SSE2-NEXT:    shrq $32, %rax
+; SSE2-NEXT:    movq %rdx, %rcx
+; SSE2-NEXT:    shrq $48, %rcx
+; SSE2-NEXT:    movq %rsi, %rdi
+; SSE2-NEXT:    shrq $32, %rdi
+; SSE2-NEXT:    movq %rsi, %r8
+; SSE2-NEXT:    shrq $48, %r8
+; SSE2-NEXT:    movl %esi, %r9d
+; SSE2-NEXT:    andl $-65536, %r9d # imm = 0xFFFF0000
+; SSE2-NEXT:    movd %r9d, %xmm0
+; SSE2-NEXT:    cvtss2sd %xmm0, %xmm1
+; SSE2-NEXT:    shll $16, %esi
+; SSE2-NEXT:    movd %esi, %xmm0
+; SSE2-NEXT:    cvtss2sd %xmm0, %xmm0
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    shll $16, %r8d
+; SSE2-NEXT:    movd %r8d, %xmm1
+; SSE2-NEXT:    cvtss2sd %xmm1, %xmm2
+; SSE2-NEXT:    shll $16, %edi
+; SSE2-NEXT:    movd %edi, %xmm1
+; SSE2-NEXT:    cvtss2sd %xmm1, %xmm1
+; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    movl %edx, %esi
+; SSE2-NEXT:    andl $-65536, %esi # imm = 0xFFFF0000
+; SSE2-NEXT:    movd %esi, %xmm2
+; SSE2-NEXT:    cvtss2sd %xmm2, %xmm3
+; SSE2-NEXT:    shll $16, %edx
+; SSE2-NEXT:    movd %edx, %xmm2
+; SSE2-NEXT:    cvtss2sd %xmm2, %xmm2
+; SSE2-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT:    shll $16, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm3
+; SSE2-NEXT:    cvtss2sd %xmm3, %xmm4
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm3
+; SSE2-NEXT:    cvtss2sd %xmm3, %xmm3
+; SSE2-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; SSE2-NEXT:    retq
+;
+; F16-LABEL: pr64460_4:
+; F16:       # %bb.0:
+; F16-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; F16-NEXT:    vpslld $16, %ymm0, %ymm0
+; F16-NEXT:    vcvtps2pd %ymm0, %zmm0
+; F16-NEXT:    retq
+  %b = fpext <8 x bfloat> %a to <8 x double>
+  ret <8 x double> %b
+}

From d469d5ce19a8134148c35451558d5a81870ca871 Mon Sep 17 00:00:00 2001
From: Vassil Vassilev <v.g.vassilev@gmail.com>
Date: Tue, 29 Aug 2023 19:38:34 +0000
Subject: [PATCH 45/92] Reland "[clang-repl] Adapt to the recent dylib-related
 changes in ORC."

Original commit message:"

ORC splits into separate dylibs symbols coming from the process and symbols
materialized in the Jit. This patch adapts the intent of the existing interface and
adds a regression test to make sure both Jit'd and compiled symbols can be found.

Differential revision: https://reviews.llvm.org/D159115
"

This patch disables the test statement on windows as it seems we might have a
bug in the way we model dllimports.

(cherry picked from commit 452cb7f20bc7b976eb6fec4ac9f2d902f4175c08)
---
 clang/lib/Interpreter/IncrementalExecutor.cpp | 19 +++++++++++++------
 .../unittests/Interpreter/InterpreterTest.cpp | 16 +++++++++++++---
 2 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/clang/lib/Interpreter/IncrementalExecutor.cpp b/clang/lib/Interpreter/IncrementalExecutor.cpp
index 3f8d60630de41..2c4dfc9a611e0 100644
--- a/clang/lib/Interpreter/IncrementalExecutor.cpp
+++ b/clang/lib/Interpreter/IncrementalExecutor.cpp
@@ -92,12 +92,19 @@ llvm::Error IncrementalExecutor::runCtors() const {
 llvm::Expected<llvm::orc::ExecutorAddr>
 IncrementalExecutor::getSymbolAddress(llvm::StringRef Name,
                                       SymbolNameKind NameKind) const {
-  auto Sym = (NameKind == LinkerName) ? Jit->lookupLinkerMangled(Name)
-                                      : Jit->lookup(Name);
-
-  if (!Sym)
-    return Sym.takeError();
-  return Sym;
+  using namespace llvm::orc;
+  auto SO = makeJITDylibSearchOrder({&Jit->getMainJITDylib(),
+                                     Jit->getPlatformJITDylib().get(),
+                                     Jit->getProcessSymbolsJITDylib().get()});
+
+  ExecutionSession &ES = Jit->getExecutionSession();
+
+  auto SymOrErr =
+      ES.lookup(SO, (NameKind == LinkerName) ? ES.intern(Name)
+                                             : Jit->mangleAndIntern(Name));
+  if (auto Err = SymOrErr.takeError())
+    return std::move(Err);
+  return SymOrErr->getAddress();
 }
 
 } // end namespace clang
diff --git a/clang/unittests/Interpreter/InterpreterTest.cpp b/clang/unittests/Interpreter/InterpreterTest.cpp
index 338003cd9851c..abb8e6377aabd 100644
--- a/clang/unittests/Interpreter/InterpreterTest.cpp
+++ b/clang/unittests/Interpreter/InterpreterTest.cpp
@@ -232,10 +232,20 @@ TEST(IncrementalProcessing, FindMangledNameSymbol) {
   }
 
   std::string MangledName = MangleName(FD);
-  auto Addr = cantFail(Interp->getSymbolAddress(MangledName));
-  EXPECT_NE(0U, Addr.getValue());
+  auto Addr = Interp->getSymbolAddress(MangledName);
+  EXPECT_FALSE(!Addr);
+  EXPECT_NE(0U, Addr->getValue());
   GlobalDecl GD(FD);
-  EXPECT_EQ(Addr, cantFail(Interp->getSymbolAddress(GD)));
+  EXPECT_EQ(*Addr, cantFail(Interp->getSymbolAddress(GD)));
+  cantFail(
+      Interp->ParseAndExecute("extern \"C\" int printf(const char*,...);"));
+  Addr = Interp->getSymbolAddress("printf");
+  EXPECT_FALSE(!Addr);
+
+  // FIXME: Re-enable when we investigate the way we handle dllimports on Win.
+#ifndef _WIN32
+  EXPECT_EQ((unsigned long long)&printf, Addr->getValue());
+#endif // _WIN32
 }
 
 static void *AllocateObject(TypeDecl *TD, Interpreter &Interp) {

From 8c5c652ed71ba3948f52f707d8022eb5ec87d802 Mon Sep 17 00:00:00 2001
From: Takuya Shimizu <shimizu2486@gmail.com>
Date: Tue, 8 Aug 2023 21:10:25 +0900
Subject: [PATCH 46/92] [clang][ExprConstant] Fix crash on uninitialized base
 class subobject

This patch fixes the reported regression caused by D146358 by adding notes about an uninitialized base class when we diagnose a constructor that leaves it uninitialized.

This also changes the wording from the old one in order to make it clear that the uninitialized subobject is a base class and its constructor is not called.
Wording changes:
BEFORE: `subobject of type 'Base' is not initialized`
AFTER: `constructor of base class 'Base' is not called`

Fixes https://github.com/llvm/llvm-project/issues/63496

Reviewed By: aaron.ballman
Differential Revision: https://reviews.llvm.org/D153969
---
 clang/docs/ReleaseNotes.rst                   |  2 +
 .../include/clang/Basic/DiagnosticASTKinds.td |  2 +
 clang/lib/AST/ExprConstant.cpp                | 13 ++++-
 .../constexpr-subobj-init-source-ranges.cpp   | 11 ++++
 .../constexpr-subobj-initialization.cpp       | 58 +++++++++++++++++++
 5 files changed, 83 insertions(+), 3 deletions(-)
 create mode 100644 clang/test/Misc/constexpr-subobj-init-source-ranges.cpp
 create mode 100644 clang/test/SemaCXX/constexpr-subobj-initialization.cpp

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index d755626f795d5..3d9774d5c7ff2 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -472,6 +472,8 @@ Improvements to Clang's diagnostics
 - Clang now emits ``-Wconstant-logical-operand`` warning even when constant logical
   operand is on left side.
   (`#37919 <https://github.com/llvm/llvm-project/issues/37919>`_)
+- Clang constexpr evaluator now displays notes as well as an error when a constructor
+  of a base class is not called in the constructor of its derived class.
 
 Bug Fixes in This Version
 -------------------------
diff --git a/clang/include/clang/Basic/DiagnosticASTKinds.td b/clang/include/clang/Basic/DiagnosticASTKinds.td
index 566cdc3406058..0794ed7ba6837 100644
--- a/clang/include/clang/Basic/DiagnosticASTKinds.td
+++ b/clang/include/clang/Basic/DiagnosticASTKinds.td
@@ -70,6 +70,8 @@ def note_consteval_address_accessible : Note<
   "is not a constant expression">;
 def note_constexpr_uninitialized : Note<
   "subobject %0 is not initialized">;
+def note_constexpr_uninitialized_base : Note<
+  "constructor of base class %0 is not called">;
 def note_constexpr_static_local : Note<
   "control flows through the definition of a %select{static|thread_local}0 variable">;
 def note_constexpr_subobject_declared_here : Note<
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 2f2f4eef852fd..f1bad0c7f7f22 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -2418,9 +2418,16 @@ static bool CheckEvaluationResult(CheckEvaluationResultKind CERK,
     if (const CXXRecordDecl *CD = dyn_cast<CXXRecordDecl>(RD)) {
       unsigned BaseIndex = 0;
       for (const CXXBaseSpecifier &BS : CD->bases()) {
-        if (!CheckEvaluationResult(CERK, Info, DiagLoc, BS.getType(),
-                                   Value.getStructBase(BaseIndex), Kind,
-                                   /*SubobjectDecl=*/nullptr, CheckedTemps))
+        const APValue &BaseValue = Value.getStructBase(BaseIndex);
+        if (!BaseValue.hasValue()) {
+          SourceLocation TypeBeginLoc = BS.getBaseTypeLoc();
+          Info.FFDiag(TypeBeginLoc, diag::note_constexpr_uninitialized_base)
+              << BS.getType() << SourceRange(TypeBeginLoc, BS.getEndLoc());
+          return false;
+        }
+        if (!CheckEvaluationResult(CERK, Info, DiagLoc, BS.getType(), BaseValue,
+                                   Kind, /*SubobjectDecl=*/nullptr,
+                                   CheckedTemps))
           return false;
         ++BaseIndex;
       }
diff --git a/clang/test/Misc/constexpr-subobj-init-source-ranges.cpp b/clang/test/Misc/constexpr-subobj-init-source-ranges.cpp
new file mode 100644
index 0000000000000..342da2d886668
--- /dev/null
+++ b/clang/test/Misc/constexpr-subobj-init-source-ranges.cpp
@@ -0,0 +1,11 @@
+// RUN: not %clang_cc1 -fsyntax-only -fdiagnostics-print-source-range-info %s 2>&1 | FileCheck %s --strict-whitespace
+
+struct DelBase {
+  constexpr DelBase() = delete;
+};
+
+// CHECK:      :{[[@LINE+1]]:21-[[@LINE+1]]:28}
+struct Foo : public DelBase {
+  constexpr Foo() {};
+};
+constexpr Foo f;
diff --git a/clang/test/SemaCXX/constexpr-subobj-initialization.cpp b/clang/test/SemaCXX/constexpr-subobj-initialization.cpp
new file mode 100644
index 0000000000000..cd096a9270937
--- /dev/null
+++ b/clang/test/SemaCXX/constexpr-subobj-initialization.cpp
@@ -0,0 +1,58 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+namespace baseclass_uninit {
+struct DelBase {
+  constexpr DelBase() = delete; // expected-note {{'DelBase' has been explicitly marked deleted here}}
+};
+
+struct Foo : DelBase {  // expected-note 2{{constructor of base class 'DelBase' is not called}}
+  constexpr Foo() {}; // expected-error {{call to deleted constructor of 'DelBase'}}
+};
+constexpr Foo f; // expected-error {{must be initialized by a constant expression}}
+struct Bar : Foo {
+  constexpr Bar() {};
+};
+constexpr Bar bar; // expected-error {{must be initialized by a constant expression}}
+
+struct Base {};
+struct A : Base { // expected-note {{constructor of base class 'Base' is not called}}
+  constexpr A() : value() {} // expected-error {{member initializer 'value' does not name a non-static data member or base class}}
+};
+
+constexpr A a; // expected-error {{must be initialized by a constant expression}}
+
+struct B : Base { // expected-note {{constructor of base class 'Base' is not called}}
+  constexpr B() : {} // expected-error {{expected class member or base class name}}
+};
+
+constexpr B b; // expected-error {{must be initialized by a constant expression}}
+} // namespace baseclass_uninit
+
+
+struct Foo {
+  constexpr Foo(); // expected-note 2{{declared here}}
+};
+
+constexpr Foo ff; // expected-error {{must be initialized by a constant expression}} \
+                  // expected-note {{undefined constructor 'Foo' cannot be used in a constant expression}}
+
+struct Bar : protected Foo {
+  int i;
+  constexpr Bar() : i(12) {} // expected-note {{undefined constructor 'Foo' cannot be used in a constant expression}}
+};
+
+constexpr Bar bb; // expected-error {{must be initialized by a constant expression}} \
+                  // expected-note {{in call to 'Bar()'}}
+
+template <typename Ty>
+struct Baz {
+  constexpr Baz(); // expected-note {{declared here}}
+};
+
+struct Quux : Baz<Foo>, private Bar {
+  int i;
+  constexpr Quux() : i(12) {} // expected-note {{undefined constructor 'Baz' cannot be used in a constant expression}}
+};
+
+constexpr Quux qx; // expected-error {{must be initialized by a constant expression}} \
+                   // expected-note {{in call to 'Quux()'}}

From 2c9feb07f3370d2f5bab2a4731dd2500079e4ec8 Mon Sep 17 00:00:00 2001
From: Galen Elias <galenelias@gmail.com>
Date: Tue, 1 Aug 2023 13:42:04 -0700
Subject: [PATCH 47/92] [clang-format] Fix braced initializer with templated
 base class

Fixes #64134.

Differential Revision: https://reviews.llvm.org/D156705

(cherry picked from commit 400da115c58ae19445cfdc871a3f559f160fc5c6)
---
 clang/lib/Format/UnwrappedLineParser.cpp | 3 ++-
 clang/unittests/Format/FormatTest.cpp    | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index 32619bc56f7a3..28e2954b5beba 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -581,7 +581,8 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) {
           ProbablyBracedList =
               ProbablyBracedList ||
               (NextTok->is(tok::l_brace) && LBraceStack.back().PrevTok &&
-               LBraceStack.back().PrevTok->is(tok::identifier));
+               LBraceStack.back().PrevTok->isOneOf(tok::identifier,
+                                                   tok::greater));
 
           ProbablyBracedList =
               ProbablyBracedList ||
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index e512a861dc4e3..ed0c5e64f4474 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -13458,6 +13458,8 @@ TEST_F(FormatTest, LayoutCxx11BraceInitializers) {
   verifyFormat(
       "class A {\n"
       "  A() : a{} {}\n"
+      "  A() : Base<int>{} {}\n"
+      "  A() : Base<Foo<int>>{} {}\n"
       "  A(int b) : b(b) {}\n"
       "  A(int a, int b) : a(a), bs{{bs...}} { f(); }\n"
       "  int a, b;\n"

From 4dea033c9b9f6393551984061fb1cd036bbeca67 Mon Sep 17 00:00:00 2001
From: dingfei <fding@feysh.com>
Date: Thu, 17 Aug 2023 13:44:05 +0800
Subject: [PATCH 48/92] [clang] Update NumFunctionDeclBits for
 FunctionDeclBitfields

NumFunctionDeclBits is not updated when DeductionCandidateKind is
incremented.

Fixes https://github.com/llvm/llvm-project/issues/64171

Reviewed By: cor3ntin, balazske, aaron.ballman

Differential Revision: https://reviews.llvm.org/D158145

(cherry picked from commit 91c4b5550ecfbb7afe7275c341b73a6d3a1bbd78)
---
 clang/docs/ReleaseNotes.rst               |  3 ++
 clang/include/clang/AST/DeclBase.h        |  6 ++--
 clang/lib/Serialization/ASTWriterDecl.cpp |  4 +--
 clang/unittests/AST/ASTImporterTest.cpp   | 41 +++++++++++++++++++++++
 clang/unittests/AST/DeclTest.cpp          | 26 ++++++++++++++
 5 files changed, 75 insertions(+), 5 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 3d9774d5c7ff2..180152417a490 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -833,6 +833,9 @@ Bug Fixes to C++ Support
 - Fix constraint checking of non-generic lambdas.
   (`#63181 <https://github.com/llvm/llvm-project/issues/63181>`_)
 
+- Update ``FunctionDeclBitfields.NumFunctionDeclBits``. This fixes:
+  (`#64171 <https://github.com/llvm/llvm-project/issues/64171>`_).
+
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h
index 1b99709ca90d9..12137387b676a 100644
--- a/clang/include/clang/AST/DeclBase.h
+++ b/clang/include/clang/AST/DeclBase.h
@@ -1702,7 +1702,7 @@ class DeclContext {
   };
 
   /// Number of non-inherited bits in FunctionDeclBitfields.
-  enum { NumFunctionDeclBits = 30 };
+  enum { NumFunctionDeclBits = 31 };
 
   /// Stores the bits used by CXXConstructorDecl. If modified
   /// NumCXXConstructorDeclBits and the accessor
@@ -1714,12 +1714,12 @@ class DeclContext {
     /// For the bits in FunctionDeclBitfields.
     uint64_t : NumFunctionDeclBits;
 
-    /// 21 bits to fit in the remaining available space.
+    /// 20 bits to fit in the remaining available space.
     /// Note that this makes CXXConstructorDeclBitfields take
     /// exactly 64 bits and thus the width of NumCtorInitializers
     /// will need to be shrunk if some bit is added to NumDeclContextBitfields,
     /// NumFunctionDeclBitfields or CXXConstructorDeclBitfields.
-    uint64_t NumCtorInitializers : 18;
+    uint64_t NumCtorInitializers : 17;
     uint64_t IsInheritingConstructor : 1;
 
     /// Whether this constructor has a trail-allocated explicit specifier.
diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp
index 59dbc36d24e8c..8dd78152bd687 100644
--- a/clang/lib/Serialization/ASTWriterDecl.cpp
+++ b/clang/lib/Serialization/ASTWriterDecl.cpp
@@ -580,7 +580,7 @@ void ASTDeclWriter::VisitDeclaratorDecl(DeclaratorDecl *D) {
 }
 
 void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) {
-  static_assert(DeclContext::NumFunctionDeclBits == 30,
+  static_assert(DeclContext::NumFunctionDeclBits == 31,
                 "You need to update the serializer after you change the "
                 "FunctionDeclBits");
 
@@ -1495,7 +1495,7 @@ void ASTDeclWriter::VisitCXXMethodDecl(CXXMethodDecl *D) {
 }
 
 void ASTDeclWriter::VisitCXXConstructorDecl(CXXConstructorDecl *D) {
-  static_assert(DeclContext::NumCXXConstructorDeclBits == 21,
+  static_assert(DeclContext::NumCXXConstructorDeclBits == 20,
                 "You need to update the serializer after you change the "
                 "CXXConstructorDeclBits");
 
diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp
index 3a1058f5e3fe9..9b81abda1d2e1 100644
--- a/clang/unittests/AST/ASTImporterTest.cpp
+++ b/clang/unittests/AST/ASTImporterTest.cpp
@@ -7711,6 +7711,47 @@ TEST_P(ASTImporterOptionSpecificTestBase, ImportConstructorUsingShadow) {
   CheckAST(ToTU, ToC);
 }
 
+TEST_P(ASTImporterOptionSpecificTestBase,
+       ImportFunctionDeclBitShouldNotOverwriteCtorDeclBits) {
+  Decl *From, *To;
+  std::tie(From, To) = getImportedDecl(
+      R"s(
+        struct A {
+          A() : m() {}
+          int m;
+        };
+
+        A foo() { A a; return a; }
+        A bar() { return {}; }
+      )s",
+      Lang_CXX17,
+      R"s(
+        struct A {
+          A() : m() {}
+          int m;
+        };
+        A baz() { return {}; }
+      )s",
+      Lang_CXX17, "A");
+
+  auto HasCtorInit =
+      hasAnyConstructorInitializer(cxxCtorInitializer(isMemberInitializer()));
+  auto ImpMoveCtor =
+      cxxConstructorDecl(isMoveConstructor(), isImplicit(), HasCtorInit);
+
+  auto *FromImpMoveCtor = FirstDeclMatcher<CXXConstructorDecl>().match(
+      From, ImpMoveCtor);
+  auto *ToImpMoveCtor = FirstDeclMatcher<CXXConstructorDecl>().match(
+      To, ImpMoveCtor);
+
+  EXPECT_TRUE(FromImpMoveCtor->getNumCtorInitializers() == 1);
+  EXPECT_FALSE(FromImpMoveCtor->FriendConstraintRefersToEnclosingTemplate());
+
+  EXPECT_TRUE(ToImpMoveCtor->getNumCtorInitializers() == 1);
+  EXPECT_FALSE(ToImpMoveCtor->FriendConstraintRefersToEnclosingTemplate());
+  EXPECT_TRUE(*ToImpMoveCtor->init_begin());
+}
+
 AST_MATCHER_P(UsingShadowDecl, hasIntroducerDecl, internal::Matcher<NamedDecl>,
               InnerMatcher) {
   return InnerMatcher.matches(*Node.getIntroducer(), Finder, Builder);
diff --git a/clang/unittests/AST/DeclTest.cpp b/clang/unittests/AST/DeclTest.cpp
index 2ed2ed750941c..d2977b0cb55b6 100644
--- a/clang/unittests/AST/DeclTest.cpp
+++ b/clang/unittests/AST/DeclTest.cpp
@@ -353,6 +353,32 @@ TEST(Decl, FriendFunctionWithinClassInHeaderUnit) {
   EXPECT_TRUE(getFooValue->isInlined());
 }
 
+TEST(Decl, FunctionDeclBitsShouldNotOverlapWithCXXConstructorDeclBits) {
+  llvm::Annotations Code(R"(
+    struct A {
+      A() : m() {}
+      int m;
+    };
+
+    A f() { return A(); }
+    )");
+
+  auto AST = tooling::buildASTFromCodeWithArgs(Code.code(), {"-std=c++14"});
+  ASTContext &Ctx = AST->getASTContext();
+
+  auto HasCtorInit =
+      hasAnyConstructorInitializer(cxxCtorInitializer(isMemberInitializer()));
+  auto ImpMoveCtor =
+      cxxConstructorDecl(isMoveConstructor(), isImplicit(), HasCtorInit)
+          .bind("MoveCtor");
+
+  auto *ToImpMoveCtor =
+      selectFirst<CXXConstructorDecl>("MoveCtor", match(ImpMoveCtor, Ctx));
+
+  EXPECT_TRUE(ToImpMoveCtor->getNumCtorInitializers() == 1);
+  EXPECT_FALSE(ToImpMoveCtor->FriendConstraintRefersToEnclosingTemplate());
+}
+
 TEST(Decl, NoProtoFunctionDeclAttributes) {
   llvm::Annotations Code(R"(
     void f();

From cf16374055c1eb3367ada29e4e544f84bc588413 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 27 Aug 2023 10:36:30 -0700
Subject: [PATCH 49/92] [asan] Intercept atoll and strtoll on Windows

`_MSC_VER>=1800` (Visual Studio 2013) supports atoll/strtoll.
Remove the obsolete workaround ASAN_INTERCEPT_ATOLL_AND_STRTOLL.

test/asan/TestCases/atoll_strict.c passes but
test/asan/TestCases/strtoll_strict.c doesn't.

(cherry picked from commit 8033231240f223dc7c718d1d27ece2dbcc8057c6)
---
 compiler-rt/lib/asan/asan_interceptors.cpp     | 6 +-----
 compiler-rt/lib/asan/asan_interceptors.h       | 2 --
 compiler-rt/lib/asan/asan_win_dll_thunk.cpp    | 2 ++
 compiler-rt/test/asan/TestCases/atoll_strict.c | 3 ---
 4 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/compiler-rt/lib/asan/asan_interceptors.cpp b/compiler-rt/lib/asan/asan_interceptors.cpp
index b9b82564b3303..df879b1fbed12 100644
--- a/compiler-rt/lib/asan/asan_interceptors.cpp
+++ b/compiler-rt/lib/asan/asan_interceptors.cpp
@@ -639,7 +639,6 @@ INTERCEPTOR(long, atol, const char *nptr) {
   return result;
 }
 
-#if ASAN_INTERCEPT_ATOLL_AND_STRTOLL
 INTERCEPTOR(long long, strtoll, const char *nptr, char **endptr, int base) {
   void *ctx;
   ASAN_INTERCEPTOR_ENTER(ctx, strtoll);
@@ -666,7 +665,6 @@ INTERCEPTOR(long long, atoll, const char *nptr) {
   ASAN_READ_STRING(ctx, nptr, (real_endptr - nptr) + 1);
   return result;
 }
-#endif  // ASAN_INTERCEPT_ATOLL_AND_STRTOLL
 
 #if ASAN_INTERCEPT___CXA_ATEXIT || ASAN_INTERCEPT_ATEXIT
 static void AtCxaAtexit(void *unused) {
@@ -751,11 +749,9 @@ void InitializeAsanInterceptors() {
 
   ASAN_INTERCEPT_FUNC(atoi);
   ASAN_INTERCEPT_FUNC(atol);
-  ASAN_INTERCEPT_FUNC(strtol);
-#if ASAN_INTERCEPT_ATOLL_AND_STRTOLL
   ASAN_INTERCEPT_FUNC(atoll);
+  ASAN_INTERCEPT_FUNC(strtol);
   ASAN_INTERCEPT_FUNC(strtoll);
-#endif
 
   // Intecept jump-related functions.
   ASAN_INTERCEPT_FUNC(longjmp);
diff --git a/compiler-rt/lib/asan/asan_interceptors.h b/compiler-rt/lib/asan/asan_interceptors.h
index 268096fea5e7e..d00d05587b368 100644
--- a/compiler-rt/lib/asan/asan_interceptors.h
+++ b/compiler-rt/lib/asan/asan_interceptors.h
@@ -42,12 +42,10 @@ void InitializePlatformInterceptors();
 // Use macro to describe if specific function should be
 // intercepted on a given platform.
 #if !SANITIZER_WINDOWS
-# define ASAN_INTERCEPT_ATOLL_AND_STRTOLL 1
 # define ASAN_INTERCEPT__LONGJMP 1
 # define ASAN_INTERCEPT_INDEX 1
 # define ASAN_INTERCEPT_PTHREAD_CREATE 1
 #else
-# define ASAN_INTERCEPT_ATOLL_AND_STRTOLL 0
 # define ASAN_INTERCEPT__LONGJMP 0
 # define ASAN_INTERCEPT_INDEX 0
 # define ASAN_INTERCEPT_PTHREAD_CREATE 0
diff --git a/compiler-rt/lib/asan/asan_win_dll_thunk.cpp b/compiler-rt/lib/asan/asan_win_dll_thunk.cpp
index e3a90f18ed81a..0fa636bec0d00 100644
--- a/compiler-rt/lib/asan/asan_win_dll_thunk.cpp
+++ b/compiler-rt/lib/asan/asan_win_dll_thunk.cpp
@@ -65,6 +65,7 @@ INTERCEPT_WRAP_W_W(_expand_dbg)
 
 INTERCEPT_LIBRARY_FUNCTION(atoi);
 INTERCEPT_LIBRARY_FUNCTION(atol);
+INTERCEPT_LIBRARY_FUNCTION(atoll);
 INTERCEPT_LIBRARY_FUNCTION(frexp);
 INTERCEPT_LIBRARY_FUNCTION(longjmp);
 #if SANITIZER_INTERCEPT_MEMCHR
@@ -91,6 +92,7 @@ INTERCEPT_LIBRARY_FUNCTION(strspn);
 INTERCEPT_LIBRARY_FUNCTION(strstr);
 INTERCEPT_LIBRARY_FUNCTION(strtok);
 INTERCEPT_LIBRARY_FUNCTION(strtol);
+INTERCEPT_LIBRARY_FUNCTION(strtoll);
 INTERCEPT_LIBRARY_FUNCTION(wcslen);
 INTERCEPT_LIBRARY_FUNCTION(wcsnlen);
 
diff --git a/compiler-rt/test/asan/TestCases/atoll_strict.c b/compiler-rt/test/asan/TestCases/atoll_strict.c
index 431ec6b4ba230..b204c97b17580 100644
--- a/compiler-rt/test/asan/TestCases/atoll_strict.c
+++ b/compiler-rt/test/asan/TestCases/atoll_strict.c
@@ -10,9 +10,6 @@
 // RUN: %env_asan_opts=strict_string_checks=false %run %t test3 2>&1
 // RUN: %env_asan_opts=strict_string_checks=true not %run %t test3 2>&1 | FileCheck %s --check-prefix=CHECK3
 
-// FIXME: Needs Windows interceptor.
-// XFAIL: target={{.*windows-(msvc.*|gnu)}}
-
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>

From dd230efe703f34678ce52280e50238abf908aaa1 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Mon, 28 Aug 2023 00:49:49 -0700
Subject: [PATCH 50/92] [sanitizer] Intercept glibc 2.38 __isoc23_* functions

`strtol("0b1", 0, 0)` can be (pre-C23) 0 or (C23) 1.
`sscanf("0b10", "%i", &x)` is similar. glibc 2.38 introduced
`__isoc23_strtol` and `__isoc23_scanf` family functions for binary
compatibility.

When `_ISOC2X_SOURCE` is defined (implied by `_GNU_SOURCE`) or
`__STDC_VERSION__ > 201710L`, `__GLIBC_USE_ISOC2X` is defined to 1 and
these `__isoc23_*` symbols are used.

Add `__isoc23_` versions for the following interceptors:

* sanitizer_common_interceptors.inc implements strtoimax/strtoumax.
  Remove incorrect FIXME about https://github.com/google/sanitizers/issues/321
* asan_interceptors.cpp implements just strtol and strtoll. The default
  `replace_str` mode checks `nptr` is readable and `endptr` is writable.
  atoi reuses the existing strtol interceptor.
* msan_interceptors.cpp implements strtol family functions and their
  `_l` versions. Tested by lib/msan/tests/msan_test.cpp
* sanitizer_common_interceptors.inc implements scanf family functions.

The strtol family interceptors are spread across several files, which is
not great, but this patch (intended for release/17.x) does not attempt to
address the issue.

Add symbols to lib/sanitizer_common/symbolizer/scripts/global_symbols.txt to
support both glibc pre-2.38 and 2.38.

When build bots migrate to glibc 2.38+, we will lose test coverage for
non-isoc23 versions since the existing C++ unittests imply `_GNU_SOURCE`.
Add test/sanitizer_common/TestCases/{strtol.c,scanf.c}.
They catch msan false positive in the absence of the interceptors.

Fix https://github.com/llvm/llvm-project/issues/64388
Fix https://github.com/llvm/llvm-project/issues/64946

Link: https://lists.gnu.org/archive/html/info-gnu/2023-07/msg00010.html
("The GNU C Library version 2.38 is now available")

Reviewed By: #sanitizers, vitalybuka, mgorny

Differential Revision: https://reviews.llvm.org/D158943

(cherry picked from commit ad7e2501000da2494860f06a306dfe8c08cc07c3)
---
 compiler-rt/lib/asan/asan_interceptors.cpp    | 50 +++++++------
 compiler-rt/lib/msan/msan_interceptors.cpp    | 37 ++++++++++
 .../sanitizer_common_interceptors.inc         | 73 +++++++++++++++----
 .../symbolizer/scripts/global_symbols.txt     |  7 ++
 .../test/sanitizer_common/TestCases/scanf.c   | 24 ++++++
 .../test/sanitizer_common/TestCases/strtol.c  | 61 ++++++++++++++++
 6 files changed, 214 insertions(+), 38 deletions(-)
 create mode 100644 compiler-rt/test/sanitizer_common/TestCases/scanf.c
 create mode 100644 compiler-rt/test/sanitizer_common/TestCases/strtol.c

diff --git a/compiler-rt/lib/asan/asan_interceptors.cpp b/compiler-rt/lib/asan/asan_interceptors.cpp
index df879b1fbed12..5158e99b75e5d 100644
--- a/compiler-rt/lib/asan/asan_interceptors.cpp
+++ b/compiler-rt/lib/asan/asan_interceptors.cpp
@@ -588,19 +588,34 @@ INTERCEPTOR(char*, strncpy, char *to, const char *from, uptr size) {
   return REAL(strncpy)(to, from, size);
 }
 
-INTERCEPTOR(long, strtol, const char *nptr, char **endptr, int base) {
-  void *ctx;
-  ASAN_INTERCEPTOR_ENTER(ctx, strtol);
-  ENSURE_ASAN_INITED();
-  if (!flags()->replace_str) {
-    return REAL(strtol)(nptr, endptr, base);
-  }
+template <typename Fn>
+static ALWAYS_INLINE auto StrtolImpl(void *ctx, Fn real, const char *nptr,
+                                     char **endptr, int base)
+    -> decltype(real(nullptr, nullptr, 0)) {
+  if (!flags()->replace_str)
+    return real(nptr, endptr, base);
   char *real_endptr;
-  long result = REAL(strtol)(nptr, &real_endptr, base);
+  auto res = real(nptr, &real_endptr, base);
   StrtolFixAndCheck(ctx, nptr, endptr, real_endptr, base);
-  return result;
+  return res;
 }
 
+#  define INTERCEPTOR_STRTO_BASE(ret_type, func)                             \
+    INTERCEPTOR(ret_type, func, const char *nptr, char **endptr, int base) { \
+      void *ctx;                                                             \
+      ASAN_INTERCEPTOR_ENTER(ctx, func);                                     \
+      ENSURE_ASAN_INITED();                                                  \
+      return StrtolImpl(ctx, REAL(func), nptr, endptr, base);                \
+    }
+
+INTERCEPTOR_STRTO_BASE(long, strtol)
+INTERCEPTOR_STRTO_BASE(long long, strtoll)
+
+#  if SANITIZER_GLIBC
+INTERCEPTOR_STRTO_BASE(long, __isoc23_strtol)
+INTERCEPTOR_STRTO_BASE(long long, __isoc23_strtoll)
+#  endif
+
 INTERCEPTOR(int, atoi, const char *nptr) {
   void *ctx;
   ASAN_INTERCEPTOR_ENTER(ctx, atoi);
@@ -639,19 +654,6 @@ INTERCEPTOR(long, atol, const char *nptr) {
   return result;
 }
 
-INTERCEPTOR(long long, strtoll, const char *nptr, char **endptr, int base) {
-  void *ctx;
-  ASAN_INTERCEPTOR_ENTER(ctx, strtoll);
-  ENSURE_ASAN_INITED();
-  if (!flags()->replace_str) {
-    return REAL(strtoll)(nptr, endptr, base);
-  }
-  char *real_endptr;
-  long long result = REAL(strtoll)(nptr, &real_endptr, base);
-  StrtolFixAndCheck(ctx, nptr, endptr, real_endptr, base);
-  return result;
-}
-
 INTERCEPTOR(long long, atoll, const char *nptr) {
   void *ctx;
   ASAN_INTERCEPTOR_ENTER(ctx, atoll);
@@ -752,6 +754,10 @@ void InitializeAsanInterceptors() {
   ASAN_INTERCEPT_FUNC(atoll);
   ASAN_INTERCEPT_FUNC(strtol);
   ASAN_INTERCEPT_FUNC(strtoll);
+#  if SANITIZER_GLIBC
+  ASAN_INTERCEPT_FUNC(__isoc23_strtol);
+  ASAN_INTERCEPT_FUNC(__isoc23_strtoll);
+#  endif
 
   // Intecept jump-related functions.
   ASAN_INTERCEPT_FUNC(longjmp);
diff --git a/compiler-rt/lib/msan/msan_interceptors.cpp b/compiler-rt/lib/msan/msan_interceptors.cpp
index f5e0d3cb9a673..9cb65d55372ff 100644
--- a/compiler-rt/lib/msan/msan_interceptors.cpp
+++ b/compiler-rt/lib/msan/msan_interceptors.cpp
@@ -464,6 +464,25 @@ INTERCEPTORS_STRTO_BASE(long long, wcstoll, wchar_t)
 INTERCEPTORS_STRTO_BASE(unsigned long, wcstoul, wchar_t)
 INTERCEPTORS_STRTO_BASE(unsigned long long, wcstoull, wchar_t)
 
+#if SANITIZER_GLIBC
+INTERCEPTORS_STRTO(double, __isoc23_strtod, char)
+INTERCEPTORS_STRTO(float, __isoc23_strtof, char)
+INTERCEPTORS_STRTO(long double, __isoc23_strtold, char)
+INTERCEPTORS_STRTO_BASE(long, __isoc23_strtol, char)
+INTERCEPTORS_STRTO_BASE(long long, __isoc23_strtoll, char)
+INTERCEPTORS_STRTO_BASE(unsigned long, __isoc23_strtoul, char)
+INTERCEPTORS_STRTO_BASE(unsigned long long, __isoc23_strtoull, char)
+INTERCEPTORS_STRTO_BASE(u64, __isoc23_strtouq, char)
+
+INTERCEPTORS_STRTO(double, __isoc23_wcstod, wchar_t)
+INTERCEPTORS_STRTO(float, __isoc23_wcstof, wchar_t)
+INTERCEPTORS_STRTO(long double, __isoc23_wcstold, wchar_t)
+INTERCEPTORS_STRTO_BASE(long, __isoc23_wcstol, wchar_t)
+INTERCEPTORS_STRTO_BASE(long long, __isoc23_wcstoll, wchar_t)
+INTERCEPTORS_STRTO_BASE(unsigned long, __isoc23_wcstoul, wchar_t)
+INTERCEPTORS_STRTO_BASE(unsigned long long, __isoc23_wcstoull, wchar_t)
+#endif
+
 #if SANITIZER_NETBSD
 #define INTERCEPT_STRTO(func) \
   INTERCEPT_FUNCTION(func); \
@@ -1748,6 +1767,24 @@ void InitializeInterceptors() {
   INTERCEPT_STRTO(wcstoul);
   INTERCEPT_STRTO(wcstoll);
   INTERCEPT_STRTO(wcstoull);
+#ifdef SANITIZER_GLIBC
+  INTERCEPT_STRTO(__isoc23_strtod);
+  INTERCEPT_STRTO(__isoc23_strtof);
+  INTERCEPT_STRTO(__isoc23_strtold);
+  INTERCEPT_STRTO(__isoc23_strtol);
+  INTERCEPT_STRTO(__isoc23_strtoul);
+  INTERCEPT_STRTO(__isoc23_strtoll);
+  INTERCEPT_STRTO(__isoc23_strtoull);
+  INTERCEPT_STRTO(__isoc23_strtouq);
+  INTERCEPT_STRTO(__isoc23_wcstod);
+  INTERCEPT_STRTO(__isoc23_wcstof);
+  INTERCEPT_STRTO(__isoc23_wcstold);
+  INTERCEPT_STRTO(__isoc23_wcstol);
+  INTERCEPT_STRTO(__isoc23_wcstoul);
+  INTERCEPT_STRTO(__isoc23_wcstoll);
+  INTERCEPT_STRTO(__isoc23_wcstoull);
+#endif
+
 #ifdef SANITIZER_NLDBL_VERSION
   INTERCEPT_FUNCTION_VER(vswprintf, SANITIZER_NLDBL_VERSION);
   INTERCEPT_FUNCTION_VER(swprintf, SANITIZER_NLDBL_VERSION);
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
index 299561b3ad3a1..0e563fa12022a 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
@@ -1491,6 +1491,16 @@ VSCANF_INTERCEPTOR_IMPL(__isoc99_vsscanf, false, str, format, ap)
 
 INTERCEPTOR(int, __isoc99_vfscanf, void *stream, const char *format, va_list ap)
 VSCANF_INTERCEPTOR_IMPL(__isoc99_vfscanf, false, stream, format, ap)
+
+INTERCEPTOR(int, __isoc23_vscanf, const char *format, va_list ap)
+VSCANF_INTERCEPTOR_IMPL(__isoc23_vscanf, false, format, ap)
+
+INTERCEPTOR(int, __isoc23_vsscanf, const char *str, const char *format,
+            va_list ap)
+VSCANF_INTERCEPTOR_IMPL(__isoc23_vsscanf, false, str, format, ap)
+
+INTERCEPTOR(int, __isoc23_vfscanf, void *stream, const char *format, va_list ap)
+VSCANF_INTERCEPTOR_IMPL(__isoc23_vfscanf, false, stream, format, ap)
 #endif  // SANITIZER_INTERCEPT_ISOC99_SCANF
 
 INTERCEPTOR(int, scanf, const char *format, ...)
@@ -1511,6 +1521,15 @@ FORMAT_INTERCEPTOR_IMPL(__isoc99_fscanf, __isoc99_vfscanf, stream, format)
 
 INTERCEPTOR(int, __isoc99_sscanf, const char *str, const char *format, ...)
 FORMAT_INTERCEPTOR_IMPL(__isoc99_sscanf, __isoc99_vsscanf, str, format)
+
+INTERCEPTOR(int, __isoc23_scanf, const char *format, ...)
+FORMAT_INTERCEPTOR_IMPL(__isoc23_scanf, __isoc23_vscanf, format)
+
+INTERCEPTOR(int, __isoc23_fscanf, void *stream, const char *format, ...)
+FORMAT_INTERCEPTOR_IMPL(__isoc23_fscanf, __isoc23_vfscanf, stream, format)
+
+INTERCEPTOR(int, __isoc23_sscanf, const char *str, const char *format, ...)
+FORMAT_INTERCEPTOR_IMPL(__isoc23_sscanf, __isoc23_vsscanf, str, format)
 #endif
 
 #endif
@@ -1534,7 +1553,13 @@ FORMAT_INTERCEPTOR_IMPL(__isoc99_sscanf, __isoc99_vsscanf, str, format)
   COMMON_INTERCEPT_FUNCTION(__isoc99_fscanf);  \
   COMMON_INTERCEPT_FUNCTION(__isoc99_vscanf);  \
   COMMON_INTERCEPT_FUNCTION(__isoc99_vsscanf); \
-  COMMON_INTERCEPT_FUNCTION(__isoc99_vfscanf);
+  COMMON_INTERCEPT_FUNCTION(__isoc99_vfscanf); \
+  COMMON_INTERCEPT_FUNCTION(__isoc23_scanf);   \
+  COMMON_INTERCEPT_FUNCTION(__isoc23_sscanf);  \
+  COMMON_INTERCEPT_FUNCTION(__isoc23_fscanf);  \
+  COMMON_INTERCEPT_FUNCTION(__isoc23_vscanf);  \
+  COMMON_INTERCEPT_FUNCTION(__isoc23_vsscanf); \
+  COMMON_INTERCEPT_FUNCTION(__isoc23_vfscanf);
 #else
 #define INIT_ISOC99_SCANF
 #endif
@@ -3539,30 +3564,26 @@ UNUSED static inline void StrtolFixAndCheck(void *ctx, const char *nptr,
                                  (real_endptr - nptr) + 1 : 0);
 }
 
-
 #if SANITIZER_INTERCEPT_STRTOIMAX
-INTERCEPTOR(INTMAX_T, strtoimax, const char *nptr, char **endptr, int base) {
-  void *ctx;
-  COMMON_INTERCEPTOR_ENTER(ctx, strtoimax, nptr, endptr, base);
-  // FIXME: under ASan the call below may write to freed memory and corrupt
-  // its metadata. See
-  // https://github.com/google/sanitizers/issues/321.
+template <typename Fn>
+static ALWAYS_INLINE auto StrtoimaxImpl(void *ctx, Fn real, const char *nptr,
+                                        char **endptr, int base)
+    -> decltype(real(nullptr, nullptr, 0)) {
   char *real_endptr;
-  INTMAX_T res = REAL(strtoimax)(nptr, &real_endptr, base);
+  auto res = real(nptr, &real_endptr, base);
   StrtolFixAndCheck(ctx, nptr, endptr, real_endptr, base);
   return res;
 }
 
+INTERCEPTOR(INTMAX_T, strtoimax, const char *nptr, char **endptr, int base) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, strtoimax, nptr, endptr, base);
+  return StrtoimaxImpl(ctx, REAL(strtoimax), nptr, endptr, base);
+}
 INTERCEPTOR(UINTMAX_T, strtoumax, const char *nptr, char **endptr, int base) {
   void *ctx;
   COMMON_INTERCEPTOR_ENTER(ctx, strtoumax, nptr, endptr, base);
-  // FIXME: under ASan the call below may write to freed memory and corrupt
-  // its metadata. See
-  // https://github.com/google/sanitizers/issues/321.
-  char *real_endptr;
-  UINTMAX_T res = REAL(strtoumax)(nptr, &real_endptr, base);
-  StrtolFixAndCheck(ctx, nptr, endptr, real_endptr, base);
-  return res;
+  return StrtoimaxImpl(ctx, REAL(strtoumax), nptr, endptr, base);
 }
 
 #define INIT_STRTOIMAX                  \
@@ -3572,6 +3593,25 @@ INTERCEPTOR(UINTMAX_T, strtoumax, const char *nptr, char **endptr, int base) {
 #define INIT_STRTOIMAX
 #endif
 
+#if SANITIZER_INTERCEPT_STRTOIMAX && SANITIZER_GLIBC
+INTERCEPTOR(INTMAX_T, __isoc23_strtoimax, const char *nptr, char **endptr, int base) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, __isoc23_strtoimax, nptr, endptr, base);
+  return StrtoimaxImpl(ctx, REAL(__isoc23_strtoimax), nptr, endptr, base);
+}
+INTERCEPTOR(UINTMAX_T, __isoc23_strtoumax, const char *nptr, char **endptr, int base) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, __isoc23_strtoumax, nptr, endptr, base);
+  return StrtoimaxImpl(ctx, REAL(__isoc23_strtoumax), nptr, endptr, base);
+}
+
+#  define INIT_STRTOIMAX_C23                       \
+    COMMON_INTERCEPT_FUNCTION(__isoc23_strtoimax); \
+    COMMON_INTERCEPT_FUNCTION(__isoc23_strtoumax);
+#else
+#  define INIT_STRTOIMAX_C23
+#endif
+
 #if SANITIZER_INTERCEPT_MBSTOWCS
 INTERCEPTOR(SIZE_T, mbstowcs, wchar_t *dest, const char *src, SIZE_T len) {
   void *ctx;
@@ -10304,6 +10344,7 @@ static void InitializeCommonInterceptors() {
   INIT_GETCWD;
   INIT_GET_CURRENT_DIR_NAME;
   INIT_STRTOIMAX;
+  INIT_STRTOIMAX_C23;
   INIT_MBSTOWCS;
   INIT_MBSNRTOWCS;
   INIT_WCSTOMBS;
diff --git a/compiler-rt/lib/sanitizer_common/symbolizer/scripts/global_symbols.txt b/compiler-rt/lib/sanitizer_common/symbolizer/scripts/global_symbols.txt
index 509e3f19fe386..819cfca44b00b 100644
--- a/compiler-rt/lib/sanitizer_common/symbolizer/scripts/global_symbols.txt
+++ b/compiler-rt/lib/sanitizer_common/symbolizer/scripts/global_symbols.txt
@@ -34,6 +34,13 @@ __interceptor_pthread_setspecific w
 __interceptor_read w
 __interceptor_realpath w
 __isinf U
+__isoc23_sscanf U
+__isoc23_strtol U
+__isoc23_strtoll U
+__isoc23_strtoll_l U
+__isoc23_strtoull U
+__isoc23_strtoull_l U
+__isoc23_vsscanf U
 __isoc99_sscanf U
 __isoc99_vsscanf U
 __moddi3 U
diff --git a/compiler-rt/test/sanitizer_common/TestCases/scanf.c b/compiler-rt/test/sanitizer_common/TestCases/scanf.c
new file mode 100644
index 0000000000000..a7f35c2af57ee
--- /dev/null
+++ b/compiler-rt/test/sanitizer_common/TestCases/scanf.c
@@ -0,0 +1,24 @@
+// RUN: %clang -std=c17 %s -o %t && %run %t
+/// Test __isoc23_* for glibc 2.38+.
+// RUN: %clang -std=c23 %s -o %t && %run %t
+
+#include <assert.h>
+#include <stdarg.h>
+#include <stdio.h>
+
+int test_vsscanf(const char *buf, const char *fmt, ...) {
+  va_list ap;
+  va_start(ap, fmt);
+  int ret = vsscanf(buf, fmt, ap);
+  va_end(ap);
+  return ret;
+}
+
+int main(int argc, char **argv) {
+  int x, y;
+  assert(sscanf("42", "%d", &x) == 1);
+  assert(x == 42);
+  assert(test_vsscanf("42", "%d", &y) == 1);
+  assert(y == 42);
+  return 0;
+}
diff --git a/compiler-rt/test/sanitizer_common/TestCases/strtol.c b/compiler-rt/test/sanitizer_common/TestCases/strtol.c
new file mode 100644
index 0000000000000..9947cdeacd8c3
--- /dev/null
+++ b/compiler-rt/test/sanitizer_common/TestCases/strtol.c
@@ -0,0 +1,61 @@
+// RUN: %clang -std=c17 %s -o %t && %run %t
+/// Test __isoc23_* for glibc 2.38+.
+// RUN: %clang -std=c23 %s -o %t && %run %t
+
+#include <assert.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <wchar.h>
+
+#define TESTL(func)                                                            \
+  {                                                                            \
+    char *end;                                                                 \
+    long l = (long)func("42", &end, 0);                                        \
+    assert(l == 42);                                                           \
+    assert(*end == '\0');                                                      \
+  }
+
+#define TESTF(func)                                                            \
+  {                                                                            \
+    char *end;                                                                 \
+    long l = (long)func("42", &end);                                           \
+    assert(l == 42);                                                           \
+    assert(*end == '\0');                                                      \
+  }
+
+#define WTESTL(func)                                                           \
+  {                                                                            \
+    wchar_t *end;                                                              \
+    long l = (long)func(L"42", &end, 0);                                       \
+    assert(l == 42);                                                           \
+    assert(*end == L'\0');                                                     \
+  }
+
+#define WTESTF(func)                                                           \
+  {                                                                            \
+    wchar_t *end;                                                              \
+    long l = (long)func(L"42", &end);                                          \
+    assert(l == 42);                                                           \
+    assert(*end == '\0');                                                      \
+  }
+
+int main() {
+  TESTL(strtol);
+  TESTL(strtoll);
+  TESTL(strtoimax);
+  TESTL(strtoul);
+  TESTL(strtoull);
+  TESTL(strtoumax);
+  TESTF(strtof);
+  TESTF(strtod);
+  TESTF(strtold);
+
+  WTESTL(wcstol);
+  WTESTL(wcstoll);
+  WTESTL(wcstoul);
+  WTESTL(wcstoull);
+  WTESTF(wcstof);
+  WTESTF(wcstod);
+  WTESTF(wcstold);
+}

From c8abecdaca4e5d7defd9d3f6c388d64f5ee03e47 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Mon, 28 Aug 2023 20:31:08 +0000
Subject: [PATCH 51/92] [compiler-rt] [test] Adjust an XFAIL for
 strtoll_strict.c for MinGW targets

8033231240f223dc7c718d1d27ece2dbcc8057c6 made this test pass
in MinGW environments, even though it is still failing in MSVC
environments.

(cherry picked from commit 277fc9475fb89c0b80d4237dbc8d698a55203c0d)
---
 compiler-rt/test/asan/TestCases/strtoll_strict.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler-rt/test/asan/TestCases/strtoll_strict.c b/compiler-rt/test/asan/TestCases/strtoll_strict.c
index 097412e3ab5c2..88e6651b6ed11 100644
--- a/compiler-rt/test/asan/TestCases/strtoll_strict.c
+++ b/compiler-rt/test/asan/TestCases/strtoll_strict.c
@@ -24,7 +24,7 @@
 
 // FIXME: Enable strtoll interceptor.
 // REQUIRES: shadow-scale-3
-// XFAIL: target={{.*windows-(msvc.*|gnu)}}
+// XFAIL: target={{.*windows-msvc.*}}
 
 #include <assert.h>
 #include <stdlib.h>

From b6cf21feeead70805ecd1231827777492034afea Mon Sep 17 00:00:00 2001
From: Brooks Davis <brooks@one-eyed-alien.net>
Date: Mon, 28 Aug 2023 21:22:29 -0700
Subject: [PATCH 52/92] [msan] Fix compilation on non-glibc

SANITIZER_GLIBC is always defined, so it should be tested with `#if`, not
`#ifdef`.

Fixes: ad7e2501000d

Reviewed By: MaskRay

Differential Revision: https://reviews.llvm.org/D159041

(cherry picked from commit 692344d87357ded619d216b265a9375f4326d8fb)
---
 compiler-rt/lib/msan/msan_interceptors.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler-rt/lib/msan/msan_interceptors.cpp b/compiler-rt/lib/msan/msan_interceptors.cpp
index 9cb65d55372ff..ba92bd14d319d 100644
--- a/compiler-rt/lib/msan/msan_interceptors.cpp
+++ b/compiler-rt/lib/msan/msan_interceptors.cpp
@@ -1767,7 +1767,7 @@ void InitializeInterceptors() {
   INTERCEPT_STRTO(wcstoul);
   INTERCEPT_STRTO(wcstoll);
   INTERCEPT_STRTO(wcstoull);
-#ifdef SANITIZER_GLIBC
+#if SANITIZER_GLIBC
   INTERCEPT_STRTO(__isoc23_strtod);
   INTERCEPT_STRTO(__isoc23_strtof);
   INTERCEPT_STRTO(__isoc23_strtold);

From 9afe6676833dc0a554463f82e8d5574a0987aa1e Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Mon, 14 Aug 2023 08:59:59 -0700
Subject: [PATCH 53/92] Function multi-versioning: disable ifunc for ELF
 targets other than glibc/Android/FreeBSD

Generalize D127933 (Fuchsia special case) to other ELF targets. Ensure
that musl, NetBSD, OpenBSD, etc. do not get ifunc codegen, which is
unsupported in their rtld.

Link: https://discourse.llvm.org/t/does-ifunc-use-from-llvm-require-os-support/67628
Close: https://github.com/llvm/llvm-project/issues/64631
(cherry picked from commit 0c3a02b8c09bb408a74a638a263e51d67c92ca74)
---
 clang/include/clang/Basic/TargetInfo.h        |  4 +-
 clang/test/CodeGen/attr-target-mv-va-args.c   | 42 ++++++++++---------
 .../CodeGen/unique-internal-linkage-names.cpp |  4 +-
 3 files changed, 27 insertions(+), 23 deletions(-)

diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index 41ef47eb565b1..61be52149341f 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -1414,7 +1414,9 @@ class TargetInfo : public TransferrableTargetInfo,
 
   /// Identify whether this target supports IFuncs.
   bool supportsIFunc() const {
-    return getTriple().isOSBinFormatELF() && !getTriple().isOSFuchsia();
+    return getTriple().isOSBinFormatELF() &&
+           ((getTriple().isOSLinux() && !getTriple().isMusl()) ||
+            getTriple().isOSFreeBSD());
   }
 
   // Validate the contents of the __builtin_cpu_supports(const char*)
diff --git a/clang/test/CodeGen/attr-target-mv-va-args.c b/clang/test/CodeGen/attr-target-mv-va-args.c
index e75796d7ee038..96821c610235b 100644
--- a/clang/test/CodeGen/attr-target-mv-va-args.c
+++ b/clang/test/CodeGen/attr-target-mv-va-args.c
@@ -1,6 +1,8 @@
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=LINUX
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=IFUNC-ELF
+// RUN: %clang_cc1 -triple x86_64-pc-freebsd -emit-llvm %s -o - | FileCheck %s --check-prefix=IFUNC-ELF
 // RUN: %clang_cc1 -triple x86_64-windows-pc -emit-llvm %s -o - | FileCheck %s --check-prefixes=NO-IFUNC,WINDOWS
-// RUN: %clang_cc1 -triple x86_64-fuchsia -emit-llvm %s -o - | FileCheck %s --check-prefixes=NO-IFUNC,FUCHSIA
+// RUN: %clang_cc1 -triple x86_64-linux-musl -emit-llvm %s -o - | FileCheck %s --check-prefixes=NO-IFUNC,NO-IFUNC-ELF
+// RUN: %clang_cc1 -triple x86_64-fuchsia -emit-llvm %s -o - | FileCheck %s --check-prefixes=NO-IFUNC,NO-IFUNC-ELF
 int __attribute__((target("sse4.2"))) foo(int i, ...) { return 0; }
 int __attribute__((target("arch=sandybridge"))) foo(int i, ...);
 int __attribute__((target("arch=ivybridge"))) foo(int i, ...) {return 1;}
@@ -10,23 +12,23 @@ int bar(void) {
   return foo(1, 'a', 1.1) + foo(2, 2.2, "asdf");
 }
 
-// LINUX: @foo.ifunc = weak_odr ifunc i32 (i32, ...), ptr @foo.resolver
-// LINUX: define{{.*}} i32 @foo.sse4.2(i32 noundef %i, ...)
-// LINUX: ret i32 0
-// LINUX: define{{.*}} i32 @foo.arch_ivybridge(i32 noundef %i, ...)
-// LINUX: ret i32 1
-// LINUX: define{{.*}} i32 @foo(i32 noundef %i, ...)
-// LINUX: ret i32 2
-// LINUX: define{{.*}} i32 @bar()
-// LINUX: call i32 (i32, ...) @foo.ifunc(i32 noundef 1, i32 noundef 97, double
-// LINUX: call i32 (i32, ...) @foo.ifunc(i32 noundef 2, double noundef 2.2{{[0-9Ee+]+}}, ptr noundef
+// IFUNC-ELF: @foo.ifunc = weak_odr ifunc i32 (i32, ...), ptr @foo.resolver
+// IFUNC-ELF: define{{.*}} i32 @foo.sse4.2(i32 noundef %i, ...)
+// IFUNC-ELF: ret i32 0
+// IFUNC-ELF: define{{.*}} i32 @foo.arch_ivybridge(i32 noundef %i, ...)
+// IFUNC-ELF: ret i32 1
+// IFUNC-ELF: define{{.*}} i32 @foo(i32 noundef %i, ...)
+// IFUNC-ELF: ret i32 2
+// IFUNC-ELF: define{{.*}} i32 @bar()
+// IFUNC-ELF: call i32 (i32, ...) @foo.ifunc(i32 noundef 1, i32 noundef 97, double
+// IFUNC-ELF: call i32 (i32, ...) @foo.ifunc(i32 noundef 2, double noundef 2.2{{[0-9Ee+]+}}, ptr noundef
 
-// LINUX: define weak_odr ptr @foo.resolver() comdat
-// LINUX: ret ptr @foo.arch_sandybridge
-// LINUX: ret ptr @foo.arch_ivybridge
-// LINUX: ret ptr @foo.sse4.2
-// LINUX: ret ptr @foo
-// LINUX: declare i32 @foo.arch_sandybridge(i32 noundef, ...)
+// IFUNC-ELF: define weak_odr ptr @foo.resolver() comdat
+// IFUNC-ELF: ret ptr @foo.arch_sandybridge
+// IFUNC-ELF: ret ptr @foo.arch_ivybridge
+// IFUNC-ELF: ret ptr @foo.sse4.2
+// IFUNC-ELF: ret ptr @foo
+// IFUNC-ELF: declare i32 @foo.arch_sandybridge(i32 noundef, ...)
 
 // NO-IFUNC: define dso_local i32 @foo.sse4.2(i32 noundef %i, ...)
 // NO-IFUNC: ret i32 0
@@ -39,10 +41,10 @@ int bar(void) {
 // NO-IFUNC: call i32 (i32, ...) @foo.resolver(i32 noundef 2, double noundef 2.2{{[0-9Ee+]+}}, ptr noundef
 
 // WINDOWS: define weak_odr dso_local i32 @foo.resolver(i32 %0, ...) comdat
-// FUCHSIA: define weak_odr i32 @foo.resolver(i32 %0, ...) comdat
+// NO-IFUNC-ELF: define weak_odr i32 @foo.resolver(i32 %0, ...) comdat
 // NO-IFUNC: musttail call i32 (i32, ...) @foo.arch_sandybridge
 // NO-IFUNC: musttail call i32 (i32, ...) @foo.arch_ivybridge
 // NO-IFUNC: musttail call i32 (i32, ...) @foo.sse4.2
 // NO-IFUNC: musttail call i32 (i32, ...) @foo
 // WINDOWS: declare dso_local i32 @foo.arch_sandybridge(i32 noundef, ...)
-// FUCHSIA: declare i32 @foo.arch_sandybridge(i32 noundef, ...)
+// NO-IFUNC-ELF: declare i32 @foo.arch_sandybridge(i32 noundef, ...)
diff --git a/clang/test/CodeGen/unique-internal-linkage-names.cpp b/clang/test/CodeGen/unique-internal-linkage-names.cpp
index 731c2c143d7b7..65cf9db80b917 100644
--- a/clang/test/CodeGen/unique-internal-linkage-names.cpp
+++ b/clang/test/CodeGen/unique-internal-linkage-names.cpp
@@ -1,7 +1,7 @@
 // This test checks if internal linkage symbols get unique names with
 // -funique-internal-linkage-names option.
-// RUN: %clang_cc1 -triple x86_64 -x c++ -S -emit-llvm -o - < %s | FileCheck %s --check-prefix=PLAIN
-// RUN: %clang_cc1 -triple x86_64 -x c++  -S -emit-llvm -funique-internal-linkage-names -o - < %s | FileCheck %s --check-prefix=UNIQUE
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -x c++ -S -emit-llvm -o - < %s | FileCheck %s --check-prefix=PLAIN
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -x c++  -S -emit-llvm -funique-internal-linkage-names -o - < %s | FileCheck %s --check-prefix=UNIQUE
 
 static int glob;
 static int foo() {

From 993681f594dcad4ea279b32bcc0a2acafcd7d7c8 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke@igalia.com>
Date: Thu, 17 Aug 2023 10:36:12 +0100
Subject: [PATCH 54/92] [RISCV] Add test case showing vmerge fold miscompile
 with tail policy

Reviewed By: reames

Differential Revision: https://reviews.llvm.org/D158160

(cherry picked from commit 6e532f94eb0e2c9c93a3d75b4cf53bf12ab9f518)
---
 .../CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll    | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
index 7620ba5310720..eb1dbdfc2db56 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
@@ -1065,3 +1065,15 @@ define <vscale x 2 x i32> @vmerge_larger_vl_poison_passthru(<vscale x 2 x i32> %
   ret <vscale x 2 x i32> %b
 }
 
+; FIXME: The vadd's new policy should be tail undisturbed since the false op of
+; the vmerge moves from the the body to the tail, and we need to preserve it.
+define <vscale x 2 x i32> @vmerge_larger_vl_false_becomes_tail(<vscale x 2 x i32> %false, <vscale x 2 x i32> %x, <vscale x 2 x i32> %y, <vscale x 2 x i1> %m) {
+; CHECK-LABEL: vmerge_larger_vl_false_becomes_tail:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, mu
+; CHECK-NEXT:    vadd.vv v8, v9, v10, v0.t
+; CHECK-NEXT:    ret
+  %a = call <vscale x 2 x i32> @llvm.riscv.vadd.nxv2i32.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i32> %x, <vscale x 2 x i32> %y, i64 2)
+  %b = call <vscale x 2 x i32> @llvm.riscv.vmerge.nxv2i32.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i32> %false, <vscale x 2 x i32> %a, <vscale x 2 x i1> %m, i64 3)
+  ret <vscale x 2 x i32> %b
+}

From 6f1974e04c3c223a812c6181c9d48ea85a472293 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke@igalia.com>
Date: Thu, 17 Aug 2023 10:49:49 +0100
Subject: [PATCH 55/92] [RISCV] Don't relax policy to ta when vmerge's VL
 shrinks during folding

When folding a vmerge into its operands, if the resulting VL is smaller than
the vmerge's original VL, the elements past the new VL that were previously in
the vmerge's body are moved to the tail. In that case, we can't relax the tail
policy to agnostic when the merge operand is undefined, since we need to
preserve these elements past the new VL.

Fixes https://github.com/llvm/llvm-project/issues/64754

Reviewed By: craig.topper, reames

Differential Revision: https://reviews.llvm.org/D158161

(cherry picked from commit 007b41b3939832b6938bb1ba91e9febebf93d3b8)
---
 llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp         | 13 ++++++++++++-
 .../test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll |  8 ++++----
 .../CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll   |  6 +++---
 3 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index cafce628cf6a2..aa20409da4e2b 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -3414,6 +3414,7 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
 
   // Because N and True must have the same merge operand (or True's operand is
   // implicit_def), the "effective" body is the minimum of their VLs.
+  SDValue OrigVL = VL;
   VL = GetMinVL(TrueVL, VL);
   if (!VL)
     return false;
@@ -3461,7 +3462,17 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
          "Expected instructions with mask have a tied dest.");
 #endif
 
-  uint64_t Policy = isImplicitDef(Merge) ? RISCVII::TAIL_AGNOSTIC : /*TUMU*/ 0;
+  // Use a tumu policy, relaxing it to tail agnostic provided that the merge
+  // operand is undefined.
+  //
+  // However, if the VL became smaller than what the vmerge had originally, then
+  // elements past VL that were previously in the vmerge's body will have moved
+  // to the tail. In that case we always need to use tail undisturbed to
+  // preserve them.
+  bool MergeVLShrunk = VL != OrigVL;
+  uint64_t Policy = (isImplicitDef(Merge) && !MergeVLShrunk)
+                        ? RISCVII::TAIL_AGNOSTIC
+                        : /*TUMU*/ 0;
   SDValue PolicyOp =
     CurDAG->getTargetConstant(Policy, DL, Subtarget->getXLenVT());
 
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll
index e74daee7cdddd..35d9b27c75f7d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll
@@ -28,7 +28,7 @@ define void @vselect_vv_v6i32(ptr %a, ptr %b, ptr %cc, ptr %z) {
 ; RV32-NEXT:    vslidedown.vi v10, v10, 2
 ; RV32-NEXT:    vand.vi v10, v10, 1
 ; RV32-NEXT:    vmsne.vi v0, v10, 0
-; RV32-NEXT:    vsetivli zero, 6, e32, m2, ta, mu
+; RV32-NEXT:    vsetivli zero, 6, e32, m2, tu, mu
 ; RV32-NEXT:    vle32.v v8, (a0), v0.t
 ; RV32-NEXT:    vse32.v v8, (a3)
 ; RV32-NEXT:    ret
@@ -58,7 +58,7 @@ define void @vselect_vv_v6i32(ptr %a, ptr %b, ptr %cc, ptr %z) {
 ; RV64-NEXT:    vslidedown.vi v10, v10, 2
 ; RV64-NEXT:    vand.vi v10, v10, 1
 ; RV64-NEXT:    vmsne.vi v0, v10, 0
-; RV64-NEXT:    vsetivli zero, 6, e32, m2, ta, mu
+; RV64-NEXT:    vsetivli zero, 6, e32, m2, tu, mu
 ; RV64-NEXT:    vle32.v v8, (a0), v0.t
 ; RV64-NEXT:    vse32.v v8, (a3)
 ; RV64-NEXT:    ret
@@ -239,7 +239,7 @@ define void @vselect_vv_v6f32(ptr %a, ptr %b, ptr %cc, ptr %z) {
 ; RV32-NEXT:    vslidedown.vi v10, v10, 2
 ; RV32-NEXT:    vand.vi v10, v10, 1
 ; RV32-NEXT:    vmsne.vi v0, v10, 0
-; RV32-NEXT:    vsetivli zero, 6, e32, m2, ta, mu
+; RV32-NEXT:    vsetivli zero, 6, e32, m2, tu, mu
 ; RV32-NEXT:    vle32.v v8, (a0), v0.t
 ; RV32-NEXT:    vse32.v v8, (a3)
 ; RV32-NEXT:    ret
@@ -269,7 +269,7 @@ define void @vselect_vv_v6f32(ptr %a, ptr %b, ptr %cc, ptr %z) {
 ; RV64-NEXT:    vslidedown.vi v10, v10, 2
 ; RV64-NEXT:    vand.vi v10, v10, 1
 ; RV64-NEXT:    vmsne.vi v0, v10, 0
-; RV64-NEXT:    vsetivli zero, 6, e32, m2, ta, mu
+; RV64-NEXT:    vsetivli zero, 6, e32, m2, tu, mu
 ; RV64-NEXT:    vle32.v v8, (a0), v0.t
 ; RV64-NEXT:    vse32.v v8, (a3)
 ; RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
index eb1dbdfc2db56..3c6515595b642 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
@@ -1065,12 +1065,12 @@ define <vscale x 2 x i32> @vmerge_larger_vl_poison_passthru(<vscale x 2 x i32> %
   ret <vscale x 2 x i32> %b
 }
 
-; FIXME: The vadd's new policy should be tail undisturbed since the false op of
-; the vmerge moves from the the body to the tail, and we need to preserve it.
+; The vadd's new policy should be tail undisturbed since the false op of the
+; vmerge moves from the body to the tail, and we need to preserve it.
 define <vscale x 2 x i32> @vmerge_larger_vl_false_becomes_tail(<vscale x 2 x i32> %false, <vscale x 2 x i32> %x, <vscale x 2 x i32> %y, <vscale x 2 x i1> %m) {
 ; CHECK-LABEL: vmerge_larger_vl_false_becomes_tail:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, mu
+; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, mu
 ; CHECK-NEXT:    vadd.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
   %a = call <vscale x 2 x i32> @llvm.riscv.vadd.nxv2i32.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i32> %x, <vscale x 2 x i32> %y, i64 2)

From 1440745b0a641205a6db6e99e68723c52d1c582b Mon Sep 17 00:00:00 2001
From: Owen Pan <owenpiano@gmail.com>
Date: Thu, 17 Aug 2023 01:00:02 -0700
Subject: [PATCH 56/92] [clang-format] Exclude kw_decltype in RemoveParentheses

From https://en.cppreference.com/w/cpp/language/decltype:
Note that if the name of an object is parenthesized, it is treated as an
ordinary lvalue expression, thus decltype(x) and decltype((x)) are often
different types.

Fixes #64786.

Differential Revision: https://reviews.llvm.org/D158155

(cherry picked from commit e3a79503a30f8c9d8fba79f3e5427bb895f320cf)
---
 clang/lib/Format/UnwrappedLineParser.cpp | 2 +-
 clang/unittests/Format/FormatTest.cpp    | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index 28e2954b5beba..852437b9390fc 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -2465,7 +2465,7 @@ bool UnwrappedLineParser::parseParens(TokenType AmpAmpTokenType) {
         const auto *PrevPrev = Prev ? Prev->getPreviousNonComment() : nullptr;
         const bool Blacklisted =
             PrevPrev &&
-            (PrevPrev->is(tok::kw___attribute) ||
+            (PrevPrev->isOneOf(tok::kw___attribute, tok::kw_decltype) ||
              (SeenEqual &&
               (PrevPrev->isOneOf(tok::kw_if, tok::kw_while) ||
                PrevPrev->endsSequence(tok::kw_constexpr, tok::kw_if))));
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index ed0c5e64f4474..271778b5bb9e6 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -26268,6 +26268,7 @@ TEST_F(FormatTest, RemoveParentheses) {
 
   Style.RemoveParentheses = FormatStyle::RPS_MultipleParentheses;
   verifyFormat("int x __attribute__((aligned(16))) = 0;", Style);
+  verifyFormat("decltype((foo->bar)) baz;", Style);
   verifyFormat("class __declspec(dllimport) X {};",
                "class __declspec((dllimport)) X {};", Style);
   verifyFormat("int x = (({ 0; }));", "int x = ((({ 0; })));", Style);

From 5a60d716ca5b9c713b6076f66fdb09f000c5c9f8 Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs@apple.com>
Date: Tue, 15 Aug 2023 16:55:32 -0700
Subject: [PATCH 57/92] [CMake] Add a few more missing dependencies on
 ClangDriverOptions

This often breaks modules-enabled bootstrap builds.

(cherry picked from commit 1e4d6122cda6529781ecf467c2ae84e5dd41acdf)
---
 clang/lib/StaticAnalyzer/Frontend/CMakeLists.txt | 1 +
 clang/tools/clang-fuzzer/CMakeLists.txt          | 6 ++++++
 2 files changed, 7 insertions(+)

diff --git a/clang/lib/StaticAnalyzer/Frontend/CMakeLists.txt b/clang/lib/StaticAnalyzer/Frontend/CMakeLists.txt
index 5293f5e0a522d..0326798e3a174 100644
--- a/clang/lib/StaticAnalyzer/Frontend/CMakeLists.txt
+++ b/clang/lib/StaticAnalyzer/Frontend/CMakeLists.txt
@@ -26,4 +26,5 @@ add_clang_library(clangStaticAnalyzerFrontend
 
   DEPENDS
   omp_gen
+  ClangDriverOptions
   )
diff --git a/clang/tools/clang-fuzzer/CMakeLists.txt b/clang/tools/clang-fuzzer/CMakeLists.txt
index e68ed8bbcb069..2b9720ee627cb 100644
--- a/clang/tools/clang-fuzzer/CMakeLists.txt
+++ b/clang/tools/clang-fuzzer/CMakeLists.txt
@@ -115,6 +115,9 @@ add_clang_executable(clang-fuzzer
   EXCLUDE_FROM_ALL
   ${DUMMY_MAIN}
   ClangFuzzer.cpp
+
+  DEPENDS
+  ClangDriverOptions
   )
 
 target_link_libraries(clang-fuzzer
@@ -127,6 +130,9 @@ add_clang_executable(clang-objc-fuzzer
   EXCLUDE_FROM_ALL
   ${DUMMY_MAIN}
   ClangObjectiveCFuzzer.cpp
+
+  DEPENDS
+  ClangDriverOptions
   )
 
 target_link_libraries(clang-objc-fuzzer

From b8444b613b3aa465b9038a0e043988a011abbbf0 Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron@aaronballman.com>
Date: Thu, 10 Aug 2023 07:22:45 -0400
Subject: [PATCH 58/92] Silently accept -Wgnu-empty-initializer

https://github.com/llvm/llvm-project/commit/5d8aaad4452f60ba8902e921d9bed606713a8f26
removed the warning group as the functionality is no longer a GNU
extension. However, users have asked for the warning group to be
supported so that code transitioning from Clang 16 to Clang 17 has an
easier migration path when compiling with -Werror. This patch restores
the warning group, but as an ignored warning group because the
functionality is now always considered to be a C extension rather than
a GNU extension. This allows users to do:

  -Werror -pedantic -Wno-gnu-empty-initializer -Wno-c2x-extensions

to silence the diagnostics in both Clang 16 and Clang 17.

Fixes https://github.com/llvm/llvm-project/issues/64357
Differential Revision: https://reviews.llvm.org/D157503

(cherry picked from commit 151214b40d869455666ca76548a9e3ad639f79de)
---
 clang/include/clang/Basic/DiagnosticGroups.td |  1 +
 clang/test/Sema/empty-init.c                  | 30 +++++++++++++++++++
 2 files changed, 31 insertions(+)
 create mode 100644 clang/test/Sema/empty-init.c

diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index 7b4d415bf0649..26bc88a980e4f 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -20,6 +20,7 @@ def DeprecatedStaticAnalyzerFlag : DiagGroup<"deprecated-static-analyzer-flag">;
 // Empty DiagGroups are recognized by clang but ignored.
 def ODR : DiagGroup<"odr">;
 def : DiagGroup<"abi">;
+def : DiagGroup<"gnu-empty-initializer">; // Now a C extension, not GNU.
 def AbsoluteValue : DiagGroup<"absolute-value">;
 def MisspelledAssumption : DiagGroup<"misspelled-assumption">;
 def UnknownAssumption : DiagGroup<"unknown-assumption">;
diff --git a/clang/test/Sema/empty-init.c b/clang/test/Sema/empty-init.c
new file mode 100644
index 0000000000000..8cb4a77710c2b
--- /dev/null
+++ b/clang/test/Sema/empty-init.c
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 %s -std=c2x -Wall -pedantic -fsyntax-only -verify=good
+// RUN: %clang_cc1 %s -std=c2x -Wpre-c2x-compat -fsyntax-only -verify=c2x
+// RUN: %clang_cc1 %s -std=c2x -Wpre-c2x-compat -Wno-gnu-empty-initializer -fsyntax-only -verify=c2x
+// RUN: %clang_cc1 %s -std=c2x -Wgnu-empty-initializer -fsyntax-only -verify=good
+// RUN: %clang_cc1 %s -std=c17 -Wall -pedantic -fsyntax-only -verify=c2x-ext
+// RUN: %clang_cc1 %s -std=c17 -Wgnu-empty-initializer -fsyntax-only -verify=good
+// RUN: %clang_cc1 %s -std=c17 -Wc2x-extensions -fsyntax-only -verify=c2x-ext
+// RUN: %clang_cc1 %s -std=c17 -Wpre-c2x-compat -fsyntax-only -verify=good
+
+// good-no-diagnostics
+
+// Empty brace initialization used to be a GNU extension, but the feature was
+// added to C2x. We now treat empty initialization as a C extension rather than
+// a GNU extension. Thus, -Wgnu-empty-initializer is always silently ignored.
+
+struct S {
+  int a;
+};
+
+struct S s = {};     /* c2x-warning {{use of an empty initializer is incompatible with C standards before C2x}}
+                        c2x-ext-warning {{use of an empty initializer is a C2x extension}}
+                      */
+
+void func(void) {
+  struct S s2 = {};  /* c2x-warning {{use of an empty initializer is incompatible with C standards before C2x}}
+                        c2x-ext-warning {{use of an empty initializer is a C2x extension}}
+                      */
+  (void)s2;
+}
+

From b8ea78cf4495ddf30e5396b7a0f2f0715bae08bb Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Mon, 28 Aug 2023 09:38:27 -0700
Subject: [PATCH 59/92] =?UTF-8?q?workflows:=20Fix=20libclang-abi=20test=20?=
 =?UTF-8?q?after=20update=20to=20use=20download-artifac=E2=80=A6=20(#64877?=
 =?UTF-8?q?)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

workflows: Fix libclang-abi test after update to use download-artifact v3
---
 .github/workflows/libclang-abi-tests.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/libclang-abi-tests.yml b/.github/workflows/libclang-abi-tests.yml
index 41896d4392885..155a1fcda7b3a 100644
--- a/.github/workflows/libclang-abi-tests.yml
+++ b/.github/workflows/libclang-abi-tests.yml
@@ -146,10 +146,12 @@ jobs:
         uses: actions/download-artifact@v3
         with:
           name: build-baseline
+          path: build-baseline
       - name: Download latest
         uses: actions/download-artifact@v3
         with:
           name: build-latest
+          path: build-latest
 
       - name: Install abi-compliance-checker
         run: sudo apt-get install abi-compliance-checker

From cfd6f0fb9d58b71ec2bd242e58be1c9522cec260 Mon Sep 17 00:00:00 2001
From: Serge Pavlov <sepavloff@gmail.com>
Date: Mon, 21 Aug 2023 12:20:37 +0700
Subject: [PATCH 60/92] [clang] Set FP options in Sema when instantiating
 CompoundStmt

When an expression is instantiated, TreeTransform skips ImplicitCastExpr
nodes, assuming they are recreated when the instantiated expression is
built. This breaks functions that use non-default floating-point options,
because those options are kept in these ImplicitCastExprs. In this case the
recreated ImplicitCastExpr takes FP options from the current Sema state
and not from the AST node.

To fix this issue, the FP options in the Sema object are set when a compound
statement is cloned in TreeTransform.

This change fixes https://github.com/llvm/llvm-project/issues/64605
([Regression 16 -> 17] Template instantiation ignores FENV_ACCESS being
ON for both definition and instantiation).

Differential Revision: https://reviews.llvm.org/D158158

(cherry picked from commit 0baf85c331090fbe2d2b42214ed0664d55feb0b5)
---
 clang/lib/Sema/TreeTransform.h        |  4 ++++
 clang/test/SemaCXX/template-64605.cpp | 23 +++++++++++++++++++++++
 2 files changed, 27 insertions(+)
 create mode 100644 clang/test/SemaCXX/template-64605.cpp

diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 10b3587885e39..097e81ea7d45a 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -7478,6 +7478,10 @@ StmtResult
 TreeTransform<Derived>::TransformCompoundStmt(CompoundStmt *S,
                                               bool IsStmtExpr) {
   Sema::CompoundScopeRAII CompoundScope(getSema());
+  Sema::FPFeaturesStateRAII FPSave(getSema());
+  if (S->hasStoredFPFeatures())
+    getSema().resetFPOptions(
+        S->getStoredFPFeatures().applyOverrides(getSema().getLangOpts()));
 
   const Stmt *ExprResult = S->getStmtExprResult();
   bool SubStmtInvalid = false;
diff --git a/clang/test/SemaCXX/template-64605.cpp b/clang/test/SemaCXX/template-64605.cpp
new file mode 100644
index 0000000000000..b13acbf2ae566
--- /dev/null
+++ b/clang/test/SemaCXX/template-64605.cpp
@@ -0,0 +1,23 @@
+// RUN: %clang_cc1 -ast-dump -ast-dump-filter=b_64605 %s | FileCheck %s
+
+// https://github.com/llvm/llvm-project/issues/64605
+
+#pragma STDC FENV_ACCESS ON
+template <typename>
+int b_64605() {
+  int x;
+  if ((float)0xFFFFFFFF != (float)0x100000000) {
+    x = 1;
+  }
+  return x;
+}
+int f() { return b_64605<void>(); }
+
+// CHECK:      ImplicitCastExpr {{.*}} 'float' <IntegralToFloating> RoundingMath=1 AllowFEnvAccess=1
+// CHECK-NEXT: IntegerLiteral {{.*}} 4294967295
+
+// CHECK:      FunctionDecl {{.*}} b_64605 'int ()' implicit_instantiation
+// CHECK-NEXT: TemplateArgument type 'void'
+
+// CHECK:      ImplicitCastExpr {{.*}} 'float' <IntegralToFloating> RoundingMath=1 AllowFEnvAccess=1
+// CHECK-NEXT: IntegerLiteral {{.*}} 4294967295

From 78447a8ad57ac62bcc40bd6c865b071e3d686ec1 Mon Sep 17 00:00:00 2001
From: Serge Pavlov <sepavloff@gmail.com>
Date: Mon, 21 Aug 2023 13:20:22 +0700
Subject: [PATCH 61/92] [clang] Run test for concrete target

The test clang/test/SemaCXX/template-64605.cpp uses pragma FENV_ACCESS,
which is not supported on all targets. Restrict it to x86_64 only.

(cherry picked from commit 73e5a70e676850b79f196e01e2194a2485041584)
---
 clang/test/SemaCXX/template-64605.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/test/SemaCXX/template-64605.cpp b/clang/test/SemaCXX/template-64605.cpp
index b13acbf2ae566..99ccbfdc27f1c 100644
--- a/clang/test/SemaCXX/template-64605.cpp
+++ b/clang/test/SemaCXX/template-64605.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -ast-dump -ast-dump-filter=b_64605 %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -ast-dump -ast-dump-filter=b_64605 %s | FileCheck %s
 
 // https://github.com/llvm/llvm-project/issues/64605
 

From 45c677d8c62b731df617181e5019316d0b2e1820 Mon Sep 17 00:00:00 2001
From: Serge Pavlov <sepavloff@gmail.com>
Date: Wed, 30 Aug 2023 12:32:35 +0700
Subject: [PATCH 62/92] [clang][test] Make check pattern shorter

A check pattern in clang/test/SemaCXX/template-64605.cpp contains the template
specialization kind (the text "implicit_instantiation"). It does not need to
be checked and can be safely removed.

The presence of this text in the check pattern prevents backporting some
commits to the release branch: https://github.com/llvm/llvm-project/issues/64605.
The text has only recently started being printed, and the commit that prints
it is not present in the release/17.x branch.

(cherry picked from commit 8859c644ede4898f90f77dcad2286de08a9ba62e)
---
 clang/test/SemaCXX/template-64605.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/test/SemaCXX/template-64605.cpp b/clang/test/SemaCXX/template-64605.cpp
index 99ccbfdc27f1c..9d7f8d4100171 100644
--- a/clang/test/SemaCXX/template-64605.cpp
+++ b/clang/test/SemaCXX/template-64605.cpp
@@ -16,7 +16,7 @@ int f() { return b_64605<void>(); }
 // CHECK:      ImplicitCastExpr {{.*}} 'float' <IntegralToFloating> RoundingMath=1 AllowFEnvAccess=1
 // CHECK-NEXT: IntegerLiteral {{.*}} 4294967295
 
-// CHECK:      FunctionDecl {{.*}} b_64605 'int ()' implicit_instantiation
+// CHECK:      FunctionDecl {{.*}} b_64605 'int ()'
 // CHECK-NEXT: TemplateArgument type 'void'
 
 // CHECK:      ImplicitCastExpr {{.*}} 'float' <IntegralToFloating> RoundingMath=1 AllowFEnvAccess=1

From 08d720da6b1e0734e8ce137e8ead004dec63280b Mon Sep 17 00:00:00 2001
From: Lei Huang <lei@ca.ibm.com>
Date: Fri, 25 Aug 2023 11:29:41 -0500
Subject: [PATCH 63/92] [PowerPC] Update V17.0.0 release notes

---
 clang/docs/ReleaseNotes.rst | 48 +++++++++++++++++++++++++++++++++----
 llvm/docs/ReleaseNotes.rst  | 48 +++++++++++++++++++++++++++++++++----
 2 files changed, 86 insertions(+), 10 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 180152417a490..76cc074dede76 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -273,6 +273,10 @@ Non-comprehensive list of changes in this release
   types. This allows access to ``llvm.nearbyint`` for arbitrary
   floating-point and vector of floating-point types.
 - Clang AST matcher now matches concept declarations with `conceptDecl`.
+- Clang now supports more GCC stdio builtins: ``__builtin_vprintf``, ``__builtin_vfprintf``,
+  ``__builtin_fscanf``, ``__builtin_scanf``, ``__builtin_sscanf``, ``__builtin_vfscanf``,
+  ``__builtin_vscanf``, ``__builtin_vsscanf``.
+
 
 New Compiler Flags
 ------------------
@@ -293,7 +297,11 @@ New Compiler Flags
 - ``-print-multi-flags-experimental`` prints the flags used for multilib
   selection. See `the multilib docs <https://clang.llvm.org/docs/Multilib.html>`_
   for more details.
-
+- ``-maix32`` and ``-maix64`` are new GCC compatibility flags that select the
+  bitmode to target on AIX.
+- ``-p`` is a new GCC compatibility flag for AIX and Linux which works
+  similarly to ``-pg`` by writing profile information, but targets the ``prof``
+  tool as opposed to the ``gprof`` tool.
 
 Deprecated Compiler Flags
 -------------------------
@@ -713,6 +721,14 @@ Bug Fixes in This Version
   The current solution may bring performance regressions if the awaiters have
   non-static data members. See
   `#64945 <https://github.com/llvm/llvm-project/issues/64945>`_ for details.
+- Clang now correctly diagnoses ``function_needs_feature`` when always_inline
+  callee has incompatible target features with caller.
+- Removed the linking of libraries when ``-r`` is passed to the driver on AIX.
+- Fixed an Itanium ABI bug where we force exactly two-byte alignment on member
+  functions to reserve a bit in function pointers for identifying pointers to
+  virtual member functions even if the target required a greater function
+  alignment and/or did not have function pointers which point to function entry
+  points (i.e., uses function descriptor objects instead).
 
 Bug Fixes to Compiler Builtins
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -991,10 +1007,19 @@ CUDA Support
 
 AIX Support
 ^^^^^^^^^^^
-- Add an AIX-only link-time option, `-mxcoff-build-id=0xHEXSTRING`, to allow users
-  to embed a hex id in their binary such that it's readable by the program itself.
-  This option is an alternative to the `--build-id=0xHEXSTRING` GNU linker option
-  which is currently not supported by the AIX linker.
+- Enabled ThinLTO support. Minimum OS requirement is AIX 7.2 TL5 SP6 or
+  the upcoming AIX 7.3 TL2.
+
+- Enabled integrated assembler (``-f[no-]integrated-as``) for LTO. LTO now
+  defaults to the integrated assembler.
+
+- Enabled Clang-based instrumented profiling
+  (``-fprofile-instr-[generate|use]``).
+
+- Added an AIX-only link-time option, ``-mxcoff-build-id=0xHEXSTRING``, to allow
+  users to embed a hex id in their binary such that it's readable by the program
+  itself. This option is an alternative to the ``--build-id=0xHEXSTRING`` GNU
+  linker option, which is currently not supported by the AIX linker.
 
 - Introduced the ``-mxcoff-roptr`` option to place constant objects with
   relocatable address values in the read-only data section. This option should
@@ -1003,6 +1028,14 @@ AIX Support
   read-only data sections with relocatable address values that resolve to
   imported symbols are made writable.
 
+- Implemented the ``-frecord-command-line`` option on AIX, which saves the
+  command-line options used from compiling a source file to the corresponding
+  object file or binary file.
+
+- Added a new linker option, ``-K``, that is used to align the header, text,
+  data, and loader sections of the output file so that each section begins on
+  a page boundary.
+
 WebAssembly Support
 ^^^^^^^^^^^^^^^^^^^
 - Shared library support (and PIC code generation) for WebAssembly is no longer
@@ -1019,6 +1052,11 @@ AVR Support
   of ``USHRT_MAX`` is now ``unsigned int`` instead of ``int``, as required by
   the C standard.
 
+PowerPC Support
+^^^^^^^^^^^^^^^
+- Clang now emits errors when it detects incompatible target features for
+  PowerPC builtins.
+
 DWARF Support in Clang
 ----------------------
 
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index bd12d8c3964f6..0cb7a6266f1ab 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -230,19 +230,57 @@ Changes to the MIPS Backend
 Changes to the PowerPC Backend
 ------------------------------
 
+* Improved code sequence of materializing 64-bit immediate numbers, expanding
+  ``is.fpclass`` intrinsic and forwarding stores.
+* Implemented DFP instructions (for use via inline asm).
+* Improved code gen for vector add.
+* Added ability to show statistics of number of entries in the TOC.
+* Added Binary Coded Decimal Assist instructions (for use via inline asm).
+* Added basic support for vector functions in GlobalISel.
+* Added additional X-Form load and store instruction generation for TLS accesses.
+* PPC64LE backend is added to JITLink.
+* Added various bug fixes and optimizations.
+* Added function pointer alignment to the DataLayout for Power, which lets us
+  make more informed choices about what this alignment defaults to for various
+  purposes (e.g., C++ pointers to member). If the target ABI uses function
+  descriptor objects, this is the alignment we would emit the descriptor with.
+  Otherwise, a function pointer points to a global entry point, so this is at
+  least the alignment for code on Power (i.e., 4-bytes).
+
+AIX Support/improvements:
+
+
 * A new option ``-mxcoff-roptr`` is added to ``clang`` and ``llc``. When this
   option is present, constant objects with relocatable address values are put
-  into the RO data section. This option should be used with the ``-fdata-sections``
-  option, and is not supported with ``-fno-data-sections``. The option is
-  only supported on AIX.
-* On AIX, teach the profile runtime to check for a build-id string; such string
-  can be created by the -mxcoff-build-id option.
+  into the RO data section. This option should be used with the
+  ``-fdata-sections`` option, and is not supported with ``-fno-data-sections``.
+
+* Taught the profile runtime to check for a build-id string. Build-id strings
+  can be created via the ``-mxcoff-build-id`` option.
+
 * Removed ``-ppc-quadword-atomics`` which only affected lock-free quadword
   atomics on AIX. Now backend generates lock-free quadword atomics code on AIX
   by default. To support lock-free quadword atomics in libatomic, the OS level
   must be at least AIX 7.2 TL5 SP3 with libc++.rte of version 17.1.1 or above
   installed.
 
+* Integrated assembler is enabled by default on AIX.
+* System assembler is always used to compile assembly files on AIX.
+* Added support for local-exec TLS.
+* Added a new option, ``--traceback-table``, to ``llvm-objdump`` to print out
+  the traceback table information for XCOFF object files.
+* Added ``llvm-ar`` object mode options ``-X32``, ``-X64``, ``-X32-64``,
+  and ``-Xany``.
+* Changed the default name of the text-section csect to be an empty string
+  instead of ``.text``. This change does not affect the behaviour
+  of the program.
+* Fixed a problem when the personality routine for the legacy AIX ``xlclang++``
+  compiler uses the stack slot to pass the exception object to the landing pad.
+  Runtime routine ``__xlc_exception_handle()`` invoked by the landing pad to
+  retrieve the exception object now skips frames not associated with functions
+  that are C++ EH-aware because the compiler sometimes generates a wrapper of
+  ``__xlc_exception_handle()`` for optimization purposes.
+
 Changes to the RISC-V Backend
 -----------------------------
 

From ad5ed49a142ba9c69b6e24fdae11d10eca2287f3 Mon Sep 17 00:00:00 2001
From: Matthias Springer <me@m-sp.org>
Date: Fri, 25 Aug 2023 15:53:39 +0200
Subject: [PATCH 64/92] [mlir][memref] Fix crash in
 SubViewReturnTypeCanonicalizer

`SubViewReturnTypeCanonicalizer` is used by `OpWithOffsetSizesAndStridesConstantArgumentFolder`, which folds constant SSA value (dynamic) sizes into static sizes. The previous implementation crashed when a dynamic size was folded into a static `1` dimension, which was then mistaken for a rank reduction.

Differential Revision: https://reviews.llvm.org/D158721
---
 mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp   | 64 +++++++++++-----------
 mlir/test/Dialect/MemRef/canonicalize.mlir | 17 +++++-
 2 files changed, 49 insertions(+), 32 deletions(-)

diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
index 5f35adf0ddaab..658756c6a6e61 100644
--- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
+++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
@@ -31,23 +31,17 @@ namespace {
 namespace saturated_arith {
 struct Wrapper {
   static Wrapper stride(int64_t v) {
-    return (ShapedType::isDynamic(v)) ? Wrapper{true, 0}
-                                                    : Wrapper{false, v};
+    return (ShapedType::isDynamic(v)) ? Wrapper{true, 0} : Wrapper{false, v};
   }
   static Wrapper offset(int64_t v) {
-    return (ShapedType::isDynamic(v)) ? Wrapper{true, 0}
-                                                    : Wrapper{false, v};
+    return (ShapedType::isDynamic(v)) ? Wrapper{true, 0} : Wrapper{false, v};
   }
   static Wrapper size(int64_t v) {
     return (ShapedType::isDynamic(v)) ? Wrapper{true, 0} : Wrapper{false, v};
   }
-  int64_t asOffset() {
-    return saturated ? ShapedType::kDynamic : v;
-  }
+  int64_t asOffset() { return saturated ? ShapedType::kDynamic : v; }
   int64_t asSize() { return saturated ? ShapedType::kDynamic : v; }
-  int64_t asStride() {
-    return saturated ? ShapedType::kDynamic : v;
-  }
+  int64_t asStride() { return saturated ? ShapedType::kDynamic : v; }
   bool operator==(Wrapper other) {
     return (saturated && other.saturated) ||
            (!saturated && !other.saturated && v == other.v);
@@ -732,8 +726,7 @@ bool CastOp::canFoldIntoConsumerOp(CastOp castOp) {
   for (auto it : llvm::zip(sourceStrides, resultStrides)) {
     auto ss = std::get<0>(it), st = std::get<1>(it);
     if (ss != st)
-      if (ShapedType::isDynamic(ss) &&
-          !ShapedType::isDynamic(st))
+      if (ShapedType::isDynamic(ss) && !ShapedType::isDynamic(st))
         return false;
   }
 
@@ -766,8 +759,7 @@ bool CastOp::areCastCompatible(TypeRange inputs, TypeRange outputs) {
       // same. They are also compatible if either one is dynamic (see
       // description of MemRefCastOp for details).
       auto checkCompatible = [](int64_t a, int64_t b) {
-        return (ShapedType::isDynamic(a) ||
-                ShapedType::isDynamic(b) || a == b);
+        return (ShapedType::isDynamic(a) || ShapedType::isDynamic(b) || a == b);
       };
       if (!checkCompatible(aOffset, bOffset))
         return false;
@@ -1890,8 +1882,7 @@ LogicalResult ReinterpretCastOp::verify() {
   // Match offset in result memref type and in static_offsets attribute.
   int64_t expectedOffset = getStaticOffsets().front();
   if (!ShapedType::isDynamic(resultOffset) &&
-      !ShapedType::isDynamic(expectedOffset) &&
-      resultOffset != expectedOffset)
+      !ShapedType::isDynamic(expectedOffset) && resultOffset != expectedOffset)
     return emitError("expected result type with offset = ")
            << expectedOffset << " instead of " << resultOffset;
 
@@ -2945,18 +2936,6 @@ static MemRefType getCanonicalSubViewResultType(
                          nonRankReducedType.getMemorySpace());
 }
 
-/// Compute the canonical result type of a SubViewOp. Call `inferResultType`
-/// to deduce the result type. Additionally, reduce the rank of the inferred
-/// result type if `currentResultType` is lower rank than `sourceType`.
-static MemRefType getCanonicalSubViewResultType(
-    MemRefType currentResultType, MemRefType sourceType,
-    ArrayRef<OpFoldResult> mixedOffsets, ArrayRef<OpFoldResult> mixedSizes,
-    ArrayRef<OpFoldResult> mixedStrides) {
-  return getCanonicalSubViewResultType(currentResultType, sourceType,
-                                       sourceType, mixedOffsets, mixedSizes,
-                                       mixedStrides);
-}
-
 Value mlir::memref::createCanonicalRankReducingSubViewOp(
     OpBuilder &b, Location loc, Value memref, ArrayRef<int64_t> targetShape) {
   auto memrefType = llvm::cast<MemRefType>(memref.getType());
@@ -3109,9 +3088,32 @@ struct SubViewReturnTypeCanonicalizer {
   MemRefType operator()(SubViewOp op, ArrayRef<OpFoldResult> mixedOffsets,
                         ArrayRef<OpFoldResult> mixedSizes,
                         ArrayRef<OpFoldResult> mixedStrides) {
-    return getCanonicalSubViewResultType(op.getType(), op.getSourceType(),
-                                         mixedOffsets, mixedSizes,
-                                         mixedStrides);
+    // Infer a memref type without taking into account any rank reductions.
+    MemRefType nonReducedType = cast<MemRefType>(SubViewOp::inferResultType(
+        op.getSourceType(), mixedOffsets, mixedSizes, mixedStrides));
+
+    // Directly return the non-rank reduced type if there are no dropped dims.
+    llvm::SmallBitVector droppedDims = op.getDroppedDims();
+    if (droppedDims.empty())
+      return nonReducedType;
+
+    // Take the strides and offset from the non-rank reduced type.
+    auto [nonReducedStrides, offset] = getStridesAndOffset(nonReducedType);
+
+    // Drop dims from shape and strides.
+    SmallVector<int64_t> targetShape;
+    SmallVector<int64_t> targetStrides;
+    for (int64_t i = 0; i < static_cast<int64_t>(mixedSizes.size()); ++i) {
+      if (droppedDims.test(i))
+        continue;
+      targetStrides.push_back(nonReducedStrides[i]);
+      targetShape.push_back(nonReducedType.getDimSize(i));
+    }
+
+    return MemRefType::get(targetShape, nonReducedType.getElementType(),
+                           StridedLayoutAttr::get(nonReducedType.getContext(),
+                                                  offset, targetStrides),
+                           nonReducedType.getMemorySpace());
   }
 };
 
diff --git a/mlir/test/Dialect/MemRef/canonicalize.mlir b/mlir/test/Dialect/MemRef/canonicalize.mlir
index b65426cad30b6..df66705e83e0e 100644
--- a/mlir/test/Dialect/MemRef/canonicalize.mlir
+++ b/mlir/test/Dialect/MemRef/canonicalize.mlir
@@ -931,7 +931,7 @@ func.func @fold_multiple_memory_space_cast(%arg : memref<?xf32>) -> memref<?xf32
 
 // -----
 
-// CHECK-lABEL: func @ub_negative_alloc_size
+// CHECK-LABEL: func private @ub_negative_alloc_size
 func.func private @ub_negative_alloc_size() -> memref<?x?x?xi1> {
   %idx1 = index.constant 1
   %c-2 = arith.constant -2 : index
@@ -940,3 +940,18 @@ func.func private @ub_negative_alloc_size() -> memref<?x?x?xi1> {
   %alloc = memref.alloc(%c15, %c-2, %idx1) : memref<?x?x?xi1>
   return %alloc : memref<?x?x?xi1>
 }
+
+// -----
+
+// CHECK-LABEL: func @subview_rank_reduction(
+//  CHECK-SAME:     %[[arg0:.*]]: memref<1x384x384xf32>, %[[arg1:.*]]: index
+func.func @subview_rank_reduction(%arg0: memref<1x384x384xf32>, %idx: index)
+    -> memref<?x?xf32, strided<[384, 1], offset: ?>> {
+  %c1 = arith.constant 1 : index
+  // CHECK: %[[subview:.*]] = memref.subview %[[arg0]][0, %[[arg1]], %[[arg1]]] [1, 1, %[[arg1]]] [1, 1, 1] : memref<1x384x384xf32> to memref<1x?xf32, strided<[384, 1], offset: ?>>
+  // CHECK: %[[cast:.*]] = memref.cast %[[subview]] : memref<1x?xf32, strided<[384, 1], offset: ?>> to memref<?x?xf32, strided<[384, 1], offset: ?>>
+  %0 = memref.subview %arg0[0, %idx, %idx] [1, %c1, %idx] [1, 1, 1]
+      : memref<1x384x384xf32> to memref<?x?xf32, strided<[384, 1], offset: ?>>
+  // CHECK: return %[[cast]]
+  return %0 : memref<?x?xf32, strided<[384, 1], offset: ?>>
+}

From 94f348b7842a2d3a00b5a7d6641b394c95486252 Mon Sep 17 00:00:00 2001
From: "Balaji V. Iyer" <bviyer@gmail.com>
Date: Fri, 25 Aug 2023 15:15:28 -0700
Subject: [PATCH 65/92] [mlir][math] Modify math.powf to handle negative bases.

Powf expansion currently returns NaN when the base is negative.
This is because taking natural log of a negative number gives
NaN. This patch will square the base and halve the exponent, thereby
getting around the negative base problem.

Reviewed By: rsuderman

Differential Revision: https://reviews.llvm.org/D158797
---
 .../Math/Transforms/ExpandPatterns.cpp        | 21 ++++++++++++++++---
 mlir/test/Dialect/Math/expand-math.mlir       | 17 ++++++++++++---
 .../test-expand-math-approx.mlir              | 11 ++--------
 3 files changed, 34 insertions(+), 15 deletions(-)

diff --git a/mlir/lib/Dialect/Math/Transforms/ExpandPatterns.cpp b/mlir/lib/Dialect/Math/Transforms/ExpandPatterns.cpp
index ee8f23cf362b6..98c97fdc2c090 100644
--- a/mlir/lib/Dialect/Math/Transforms/ExpandPatterns.cpp
+++ b/mlir/lib/Dialect/Math/Transforms/ExpandPatterns.cpp
@@ -168,11 +168,26 @@ static LogicalResult convertPowfOp(math::PowFOp op, PatternRewriter &rewriter) {
   Value operandA = op.getOperand(0);
   Value operandB = op.getOperand(1);
   Type opType = operandA.getType();
+  Value zero = createFloatConst(op->getLoc(), opType, 0.00, rewriter);
+  Value two = createFloatConst(op->getLoc(), opType, 2.00, rewriter);
+  Value negOne = createFloatConst(op->getLoc(), opType, -1.00, rewriter);
+  Value opASquared = b.create<arith::MulFOp>(opType, operandA, operandA);
+  Value opBHalf = b.create<arith::DivFOp>(opType, operandB, two);
 
-  Value logA = b.create<math::LogOp>(opType, operandA);
-  Value mult = b.create<arith::MulFOp>(opType, logA, operandB);
+  Value logA = b.create<math::LogOp>(opType, opASquared);
+  Value mult = b.create<arith::MulFOp>(opType, opBHalf, logA);
   Value expResult = b.create<math::ExpOp>(opType, mult);
-  rewriter.replaceOp(op, expResult);
+  Value negExpResult = b.create<arith::MulFOp>(opType, expResult, negOne);
+  Value remainder = b.create<arith::RemFOp>(opType, operandB, two);
+  Value negCheck =
+      b.create<arith::CmpFOp>(arith::CmpFPredicate::OLT, operandA, zero);
+  Value oddPower =
+      b.create<arith::CmpFOp>(arith::CmpFPredicate::ONE, remainder, zero);
+  Value oddAndNeg = b.create<arith::AndIOp>(op->getLoc(), oddPower, negCheck);
+
+  Value res = b.create<arith::SelectOp>(op->getLoc(), oddAndNeg, negExpResult,
+                                        expResult);
+  rewriter.replaceOp(op, res);
   return success();
 }
 
diff --git a/mlir/test/Dialect/Math/expand-math.mlir b/mlir/test/Dialect/Math/expand-math.mlir
index c28e2141db061..4cd6461102079 100644
--- a/mlir/test/Dialect/Math/expand-math.mlir
+++ b/mlir/test/Dialect/Math/expand-math.mlir
@@ -222,10 +222,21 @@ func.func @roundf_func(%a: f32) -> f32 {
 // CHECK-LABEL:   func @powf_func
 // CHECK-SAME:    ([[ARG0:%.+]]: f64, [[ARG1:%.+]]: f64)
 func.func @powf_func(%a: f64, %b: f64) ->f64 {
-  // CHECK-DAG: [[LOG:%.+]] = math.log [[ARG0]]
-  // CHECK-DAG: [[MULT:%.+]] = arith.mulf [[LOG]], [[ARG1]]
+  // CHECK-DAG: [[CST0:%.+]] = arith.constant 0.000000e+00
+  // CHECK-DAG: [[TWO:%.+]] = arith.constant 2.000000e+00
+  // CHECK-DAG: [[NEGONE:%.+]] = arith.constant -1.000000e+00
+  // CHECK-DAG: [[SQR:%.+]] = arith.mulf [[ARG0]], [[ARG0]]
+  // CHECK-DAG: [[HALF:%.+]] = arith.divf [[ARG1]], [[TWO]] 
+  // CHECK-DAG: [[LOG:%.+]] = math.log [[SQR]]
+  // CHECK-DAG: [[MULT:%.+]] = arith.mulf [[HALF]], [[LOG]]
   // CHECK-DAG: [[EXPR:%.+]] = math.exp [[MULT]]
-  // CHECK: return [[EXPR]]
+  // CHECK-DAG: [[NEGEXPR:%.+]] = arith.mulf [[EXPR]], [[NEGONE]]
+  // CHECK-DAG: [[REMF:%.+]] = arith.remf [[ARG1]], [[TWO]]
+  // CHECK-DAG: [[CMPNEG:%.+]] = arith.cmpf olt, [[ARG0]]
+  // CHECK-DAG: [[CMPZERO:%.+]] = arith.cmpf one, [[REMF]]
+  // CHECK-DAG: [[AND:%.+]] = arith.andi [[CMPZERO]], [[CMPNEG]]
+  // CHECK-DAG: [[SEL:%.+]] = arith.select [[AND]], [[NEGEXPR]], [[EXPR]]
+  // CHECK: return [[SEL]]
   %ret = math.powf %a, %b : f64
   return %ret : f64
 }
diff --git a/mlir/test/mlir-cpu-runner/test-expand-math-approx.mlir b/mlir/test/mlir-cpu-runner/test-expand-math-approx.mlir
index 30f30def56fdd..847c41fec9135 100644
--- a/mlir/test/mlir-cpu-runner/test-expand-math-approx.mlir
+++ b/mlir/test/mlir-cpu-runner/test-expand-math-approx.mlir
@@ -195,7 +195,7 @@ func.func @powf() {
   %a_p = arith.constant 2.0 : f64
   call @func_powff64(%a, %a_p) : (f64, f64) -> ()
 
-  // CHECK-NEXT: nan
+  // CHECK-NEXT: -27
   %b   = arith.constant -3.0 : f64
   %b_p = arith.constant 3.0 : f64
   call @func_powff64(%b, %b_p) : (f64, f64) -> ()
@@ -220,16 +220,9 @@ func.func @powf() {
   %f_p  = arith.constant 1.2 : f64
   call @func_powff64(%f, %f_p) : (f64, f64) -> ()
 
-  // CHECK-NEXT: nan
-  %g    = arith.constant 0xff80000000000000 : f64
-  call @func_powff64(%g, %g) : (f64, f64) -> ()
-
-  // CHECK-NEXT: nan
-  %h = arith.constant 0x7fffffffffffffff : f64
-  call @func_powff64(%h, %h) : (f64, f64) -> ()
-
   // CHECK-NEXT: nan
   %i = arith.constant 1.0 : f64
+  %h = arith.constant 0x7fffffffffffffff : f64
   call @func_powff64(%i, %h) : (f64, f64) -> ()
 
   // CHECK-NEXT: inf

From cc7e24c7a723fee9c4209663ea1517aeba34e42a Mon Sep 17 00:00:00 2001
From: Matthias Springer <me@m-sp.org>
Date: Sat, 26 Aug 2023 09:57:43 +0200
Subject: [PATCH 66/92] [mlir] Fix crash when adding nested dialect extensions

A dialect extension can add additional dialect extensions in its `apply` function. This used to crash when the vector of `extensions` was internally reallocated while it is being iterated over.

Differential Revision: https://reviews.llvm.org/D158838
---
 mlir/lib/IR/Dialect.cpp           | 13 +++++----
 mlir/unittests/IR/DialectTest.cpp | 46 +++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+), 5 deletions(-)

diff --git a/mlir/lib/IR/Dialect.cpp b/mlir/lib/IR/Dialect.cpp
index 501f52b83e026..1de49769974ac 100644
--- a/mlir/lib/IR/Dialect.cpp
+++ b/mlir/lib/IR/Dialect.cpp
@@ -125,7 +125,8 @@ DialectInterfaceCollectionBase::DialectInterfaceCollectionBase(
     MLIRContext *ctx, TypeID interfaceKind, StringRef interfaceName) {
   for (auto *dialect : ctx->getLoadedDialects()) {
 #ifndef NDEBUG
-  dialect->handleUseOfUndefinedPromisedInterface(interfaceKind, interfaceName);
+    dialect->handleUseOfUndefinedPromisedInterface(interfaceKind,
+                                                   interfaceName);
 #endif
     if (auto *interface = dialect->getRegisteredInterface(interfaceKind)) {
       interfaces.insert(interface);
@@ -243,8 +244,9 @@ void DialectRegistry::applyExtensions(Dialect *dialect) const {
     extension.apply(ctx, requiredDialects);
   };
 
-  for (const auto &extension : extensions)
-    applyExtension(*extension);
+  // Note: Additional extensions may be added while applying an extension.
+  for (int i = 0; i < static_cast<int>(extensions.size()); ++i)
+    applyExtension(*extensions[i]);
 }
 
 void DialectRegistry::applyExtensions(MLIRContext *ctx) const {
@@ -264,8 +266,9 @@ void DialectRegistry::applyExtensions(MLIRContext *ctx) const {
     extension.apply(ctx, requiredDialects);
   };
 
-  for (const auto &extension : extensions)
-    applyExtension(*extension);
+  // Note: Additional extensions may be added while applying an extension.
+  for (int i = 0; i < static_cast<int>(extensions.size()); ++i)
+    applyExtension(*extensions[i]);
 }
 
 bool DialectRegistry::isSubsetOf(const DialectRegistry &rhs) const {
diff --git a/mlir/unittests/IR/DialectTest.cpp b/mlir/unittests/IR/DialectTest.cpp
index a2b58bf731976..e99d46e6d2643 100644
--- a/mlir/unittests/IR/DialectTest.cpp
+++ b/mlir/unittests/IR/DialectTest.cpp
@@ -136,4 +136,50 @@ TEST(Dialect, RepeatedDelayedRegistration) {
   EXPECT_TRUE(testDialectInterface != nullptr);
 }
 
+namespace {
+/// A dummy extension that increases a counter when being applied and
+/// recursively adds additional extensions.
+struct DummyExtension : DialectExtension<DummyExtension, TestDialect> {
+  DummyExtension(int *counter, int numRecursive)
+      : DialectExtension(), counter(counter), numRecursive(numRecursive) {}
+
+  void apply(MLIRContext *ctx, TestDialect *dialect) const final {
+    ++(*counter);
+    DialectRegistry nestedRegistry;
+    for (int i = 0; i < numRecursive; ++i)
+      nestedRegistry.addExtension(
+          std::make_unique<DummyExtension>(counter, /*numRecursive=*/0));
+    // Adding additional extensions may trigger a reallocation of the
+    // `extensions` vector in the dialect registry.
+    ctx->appendDialectRegistry(nestedRegistry);
+  }
+
+private:
+  int *counter;
+  int numRecursive;
+};
+} // namespace
+
+TEST(Dialect, NestedDialectExtension) {
+  DialectRegistry registry;
+  registry.insert<TestDialect>();
+
+  // Add an extension that adds 100 more extensions.
+  int counter1 = 0;
+  registry.addExtension(std::make_unique<DummyExtension>(&counter1, 100));
+  // Add one more extension. This should not crash.
+  int counter2 = 0;
+  registry.addExtension(std::make_unique<DummyExtension>(&counter2, 0));
+
+  // Load dialect and apply extensions.
+  MLIRContext context(registry);
+  Dialect *testDialect = context.getOrLoadDialect<TestDialect>();
+  ASSERT_TRUE(testDialect != nullptr);
+
+  // Extensions may be applied multiple times. Make sure that each expected
+  // extension was applied at least once.
+  EXPECT_GE(counter1, 101);
+  EXPECT_GE(counter2, 1);
+}
+
 } // namespace

From b66219d735006fafeeb2b2a1194821daee2f7245 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Markus=20B=C3=B6ck?= <markus.boeck02@gmail.com>
Date: Sat, 26 Aug 2023 16:10:52 +0200
Subject: [PATCH 67/92] [mlir] Fix infinite recursion in alias initializer

The alias initializer keeps a list of child indices around. When an alias is then marked as non-deferrable, all children are also marked non-deferrable.

This is currently done naively which leads to an infinite recursion if using mutable types or attributes containing a cycle.

This patch fixes this by adding an early return if the alias is already marked non-deferrable. Since this function is the only way to mark an alias as non-deferrable, it is guaranteed that if it is marked non-deferrable, all its children are as well, and it is not required to walk all the children.
This incidentally makes the non-deferrable marking also `O(n)` instead of `O(n^2)` (although not performance sensitive obviously).

Differential Revision: https://reviews.llvm.org/D158932
---
 mlir/lib/IR/AsmPrinter.cpp                 |  6 +++
 mlir/test/IR/recursive-type.mlir           | 12 +++++
 mlir/test/lib/Dialect/Test/TestDialect.cpp |  4 ++
 mlir/test/lib/Dialect/Test/TestTypeDefs.td | 22 ++++++++++
 mlir/test/lib/Dialect/Test/TestTypes.cpp   | 51 ++++++++++++++++++++++
 mlir/test/lib/Dialect/Test/TestTypes.h     |  6 +--
 6 files changed, 98 insertions(+), 3 deletions(-)

diff --git a/mlir/lib/IR/AsmPrinter.cpp b/mlir/lib/IR/AsmPrinter.cpp
index 325f986f97694..af41532670890 100644
--- a/mlir/lib/IR/AsmPrinter.cpp
+++ b/mlir/lib/IR/AsmPrinter.cpp
@@ -1043,6 +1043,12 @@ std::pair<size_t, size_t> AliasInitializer::visitImpl(
 
 void AliasInitializer::markAliasNonDeferrable(size_t aliasIndex) {
   auto it = std::next(aliases.begin(), aliasIndex);
+
+  // If already marked non-deferrable stop the recursion.
+  // All children should already be marked non-deferrable as well.
+  if (!it->second.canBeDeferred)
+    return;
+
   it->second.canBeDeferred = false;
 
   // Propagate the non-deferrable flag to any child aliases.
diff --git a/mlir/test/IR/recursive-type.mlir b/mlir/test/IR/recursive-type.mlir
index bc9b2cdbea6b6..121ba095573ba 100644
--- a/mlir/test/IR/recursive-type.mlir
+++ b/mlir/test/IR/recursive-type.mlir
@@ -1,6 +1,8 @@
 // RUN: mlir-opt %s -test-recursive-types | FileCheck %s
 
 // CHECK: !testrec = !test.test_rec<type_to_alias, test_rec<type_to_alias>>
+// CHECK: ![[$NAME:.*]] = !test.test_rec_alias<name, !test.test_rec_alias<name>>
+// CHECK: ![[$NAME2:.*]] = !test.test_rec_alias<name2, tuple<!test.test_rec_alias<name2>, i32>>
 
 // CHECK-LABEL: @roundtrip
 func.func @roundtrip() {
@@ -12,6 +14,16 @@ func.func @roundtrip() {
   // into inifinite recursion.
   // CHECK: !testrec
   "test.dummy_op_for_roundtrip"() : () -> !test.test_rec<type_to_alias, test_rec<type_to_alias>>
+
+  // CHECK: () -> ![[$NAME]]
+  // CHECK: () -> ![[$NAME]]
+  "test.dummy_op_for_roundtrip"() : () -> !test.test_rec_alias<name, !test.test_rec_alias<name>>
+  "test.dummy_op_for_roundtrip"() : () -> !test.test_rec_alias<name, !test.test_rec_alias<name>>
+
+  // CHECK: () -> ![[$NAME2]]
+  // CHECK: () -> ![[$NAME2]]
+  "test.dummy_op_for_roundtrip"() : () -> !test.test_rec_alias<name2, tuple<!test.test_rec_alias<name2>, i32>>
+  "test.dummy_op_for_roundtrip"() : () -> !test.test_rec_alias<name2, tuple<!test.test_rec_alias<name2>, i32>>
   return
 }
 
diff --git a/mlir/test/lib/Dialect/Test/TestDialect.cpp b/mlir/test/lib/Dialect/Test/TestDialect.cpp
index 072f6ff4b84d3..debe733f59be4 100644
--- a/mlir/test/lib/Dialect/Test/TestDialect.cpp
+++ b/mlir/test/lib/Dialect/Test/TestDialect.cpp
@@ -312,6 +312,10 @@ struct TestOpAsmInterface : public OpAsmDialectInterface {
         return AliasResult::FinalAlias;
       }
     }
+    if (auto recAliasType = dyn_cast<TestRecursiveAliasType>(type)) {
+      os << recAliasType.getName();
+      return AliasResult::FinalAlias;
+    }
     return AliasResult::NoAlias;
   }
 
diff --git a/mlir/test/lib/Dialect/Test/TestTypeDefs.td b/mlir/test/lib/Dialect/Test/TestTypeDefs.td
index 15dbd74aec118..2a8bdad8fb25d 100644
--- a/mlir/test/lib/Dialect/Test/TestTypeDefs.td
+++ b/mlir/test/lib/Dialect/Test/TestTypeDefs.td
@@ -369,4 +369,26 @@ def TestTypeElseAnchorStruct : Test_Type<"TestTypeElseAnchorStruct"> {
   let assemblyFormat = "`<` (`?`) : (struct($a, $b)^)? `>`";
 }
 
+def TestI32 : Test_Type<"TestI32"> {
+  let mnemonic = "i32";
+}
+
+def TestRecursiveAlias
+    : Test_Type<"TestRecursiveAlias", [NativeTypeTrait<"IsMutable">]> {
+  let mnemonic = "test_rec_alias";
+  let storageClass = "TestRecursiveTypeStorage";
+  let storageNamespace = "test";
+  let genStorageClass = 0;
+
+  let parameters = (ins "llvm::StringRef":$name);
+
+  let hasCustomAssemblyFormat = 1;
+
+  let extraClassDeclaration = [{
+    Type getBody() const;
+
+    void setBody(Type type);
+  }];
+}
+
 #endif // TEST_TYPEDEFS
diff --git a/mlir/test/lib/Dialect/Test/TestTypes.cpp b/mlir/test/lib/Dialect/Test/TestTypes.cpp
index 0633752067a14..20dc03a765269 100644
--- a/mlir/test/lib/Dialect/Test/TestTypes.cpp
+++ b/mlir/test/lib/Dialect/Test/TestTypes.cpp
@@ -482,3 +482,54 @@ void TestDialect::printType(Type type, DialectAsmPrinter &printer) const {
   SetVector<Type> stack;
   printTestType(type, printer, stack);
 }
+
+Type TestRecursiveAliasType::getBody() const { return getImpl()->body; }
+
+void TestRecursiveAliasType::setBody(Type type) { (void)Base::mutate(type); }
+
+StringRef TestRecursiveAliasType::getName() const { return getImpl()->name; }
+
+/// Parses `<name, body>`, or just `<name>` inside a recursive self-reference.
+Type TestRecursiveAliasType::parse(AsmParser &parser) {
+  thread_local static SetVector<Type> stack;
+  StringRef name;
+  if (parser.parseLess() || parser.parseKeyword(&name))
+    return Type();
+  auto rec = TestRecursiveAliasType::get(parser.getContext(), name);
+
+  // If this type already has been parsed above in the stack, expect just the
+  // name.
+  if (stack.contains(rec)) {
+    if (failed(parser.parseGreater()))
+      return Type();
+    return rec;
+  }
+
+  // Otherwise, parse the body and update the type.
+  if (failed(parser.parseComma()))
+    return Type();
+  stack.insert(rec);
+  Type subtype;
+  // Pop even on failure so the thread-local stack never keeps a stale entry.
+  const bool bodyFailed = failed(parser.parseType(subtype));
+  stack.pop_back();
+  if (bodyFailed || !subtype || failed(parser.parseGreater()))
+    return Type();
+
+  rec.setBody(subtype);
+
+  return rec;
+}
+
+void TestRecursiveAliasType::print(AsmPrinter &printer) const {
+  thread_local static SetVector<Type> stack;
+
+  printer << "<" << getName();
+  if (!stack.contains(*this)) {
+    printer << ", ";
+    stack.insert(*this);
+    printer << getBody();
+    stack.pop_back();
+  }
+  printer << ">";
+}
diff --git a/mlir/test/lib/Dialect/Test/TestTypes.h b/mlir/test/lib/Dialect/Test/TestTypes.h
index c7d169d020d56..0ce86dd70ab90 100644
--- a/mlir/test/lib/Dialect/Test/TestTypes.h
+++ b/mlir/test/lib/Dialect/Test/TestTypes.h
@@ -91,9 +91,6 @@ struct FieldParser<std::optional<int>> {
 
 #include "TestTypeInterfaces.h.inc"
 
-#define GET_TYPEDEF_CLASSES
-#include "TestTypeDefs.h.inc"
-
 namespace test {
 
 /// Storage for simple named recursive types, where the type is identified by
@@ -150,4 +147,7 @@ class TestRecursiveType
 
 } // namespace test
 
+#define GET_TYPEDEF_CLASSES
+#include "TestTypeDefs.h.inc"
+
 #endif // MLIR_TESTTYPES_H

From 1b1113731fefb9d30f9167514a6cdf3b3f0ee007 Mon Sep 17 00:00:00 2001
From: Garvit Gupta <quic_garvgupt@quicinc.com>
Date: Tue, 29 Aug 2023 22:05:44 -0700
Subject: [PATCH 68/92] [RISCV] Fix assertion failure when zcmp extension is
 enabled.

Before accessing "getOpcode" through machine instruction, check if the iterator
has reached the end of Machine basic block otherwise we will crash at the
assertion `!NodePtr->isKnownSentinel()`.

The above assertion is hit in "Prologue/Epilogue Insertion & Frame Finalization
pass".

Reviewed By: craig.topper, wangpc

Differential Revision: https://reviews.llvm.org/D158256

(cherry picked from commit fdef7952cbc26fd31d0d92a4f14dde58bc461fe9)
---
 llvm/lib/Target/RISCV/RISCVFrameLowering.cpp  |   3 +-
 .../RISCV/zcmp-prolog-epilog-crash.mir        | 158 ++++++++++++++++++
 2 files changed, 160 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/RISCV/zcmp-prolog-epilog-crash.mir

diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index cb2a49db92332..f312cc8129ddf 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -772,7 +772,8 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
   if (FirstSPAdjustAmount)
     StackSize = FirstSPAdjustAmount;
 
-  if (RVFI->isPushable(MF) && MBBI->getOpcode() == RISCV::CM_POP) {
+  if (RVFI->isPushable(MF) && MBBI != MBB.end() &&
+      MBBI->getOpcode() == RISCV::CM_POP) {
     // Use available stack adjustment in pop instruction to deallocate stack
     // space.
     unsigned PushStack = RVFI->getRVPushRegs() * (STI.getXLen() / 8);
diff --git a/llvm/test/CodeGen/RISCV/zcmp-prolog-epilog-crash.mir b/llvm/test/CodeGen/RISCV/zcmp-prolog-epilog-crash.mir
new file mode 100644
index 0000000000000..64556ec0b343a
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/zcmp-prolog-epilog-crash.mir
@@ -0,0 +1,158 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+# REQUIRES: asserts
+# RUN: llc  %s -o - -mtriple=riscv32 -mattr=+zcmp -target-abi ilp32 -run-pass=prologepilog \
+# RUN:   -simplify-mir -verify-machineinstrs | FileCheck %s
+
+--- |
+  define hidden void @f(fp128 %a) local_unnamed_addr #0 {
+  entry:
+    %0 = bitcast fp128 %a to i128
+    %and.i = lshr i128 %0, 112
+    %1 = trunc i128 %and.i to i32
+    %2 = and i32 %1, 32767
+    %or.i = or i128 poison, 5192296858534827628530496329220096
+    br label %if.end.i
+
+  if.end.i:                                         ; preds = %entry
+    br i1 poison, label %exit, label %if.then12.i
+
+  if.then12.i:                                      ; preds = %if.end.i
+    %sub13.i = sub nuw nsw i32 16495, %2
+    %sh_prom.i = zext i32 %sub13.i to i128
+    %shr14.i = lshr i128 %or.i, %sh_prom.i
+    %conv15.i = trunc i128 %shr14.i to i32
+    br label %exit
+
+  exit:                                             ; preds = %if.then12.i, %if.end.i
+    %retval.0.i = phi i32 [ %conv15.i, %if.then12.i ], [ -1, %if.end.i ]
+    ret void
+  }
+...
+---
+name:            f
+alignment:       2
+tracksRegLiveness: true
+tracksDebugUserValues: true
+liveins:
+  - { reg: '$x10' }
+frameInfo:
+  maxAlignment:    1
+  localFrameSize:  32
+  savePoint:       '%bb.2'
+  restorePoint:    '%bb.2'
+stack:
+  - { id: 0, size: 32, alignment: 1, local-offset: -32 }
+machineFunctionInfo:
+  varArgsFrameIndex: 0
+  varArgsSaveSize: 0
+body:             |
+  ; CHECK-LABEL: name: f
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   liveins: $x10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $x10 = ADDI $x0, -1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1.if.end.i:
+  ; CHECK-NEXT:   liveins: $x10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   BNE $x0, $x0, %bb.3
+  ; CHECK-NEXT:   PseudoBR %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2.if.then12.i:
+  ; CHECK-NEXT:   liveins: $x10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $x2 = frame-setup ADDI $x2, -32
+  ; CHECK-NEXT:   frame-setup CFI_INSTRUCTION def_cfa_offset 32
+  ; CHECK-NEXT:   SB $x0, $x2, 31 :: (store (s8) into %stack.0 + 31)
+  ; CHECK-NEXT:   SB $x0, $x2, 30 :: (store (s8) into %stack.0 + 30)
+  ; CHECK-NEXT:   SB $x0, $x2, 29 :: (store (s8) into %stack.0 + 29)
+  ; CHECK-NEXT:   SB $x0, $x2, 28 :: (store (s8) into %stack.0 + 28)
+  ; CHECK-NEXT:   SB $x0, $x2, 27 :: (store (s8) into %stack.0 + 27)
+  ; CHECK-NEXT:   SB $x0, $x2, 26 :: (store (s8) into %stack.0 + 26)
+  ; CHECK-NEXT:   SB $x0, $x2, 25 :: (store (s8) into %stack.0 + 25)
+  ; CHECK-NEXT:   SB $x0, $x2, 24 :: (store (s8) into %stack.0 + 24)
+  ; CHECK-NEXT:   SB $x0, $x2, 23 :: (store (s8) into %stack.0 + 23)
+  ; CHECK-NEXT:   SB $x0, $x2, 22 :: (store (s8) into %stack.0 + 22)
+  ; CHECK-NEXT:   SB $x0, $x2, 21 :: (store (s8) into %stack.0 + 21)
+  ; CHECK-NEXT:   SB $x0, $x2, 20 :: (store (s8) into %stack.0 + 20)
+  ; CHECK-NEXT:   SB $x0, $x2, 19 :: (store (s8) into %stack.0 + 19)
+  ; CHECK-NEXT:   SB $x0, $x2, 18 :: (store (s8) into %stack.0 + 18)
+  ; CHECK-NEXT:   SB $x0, $x2, 17 :: (store (s8) into %stack.0 + 17)
+  ; CHECK-NEXT:   SB $x0, $x2, 16 :: (store (s8) into %stack.0 + 16)
+  ; CHECK-NEXT:   SB renamable $x10, $x2, 0 :: (store (s8) into %stack.0)
+  ; CHECK-NEXT:   SB renamable $x10, $x2, 4 :: (store (s8) into %stack.0 + 4)
+  ; CHECK-NEXT:   renamable $x11 = SRLI renamable $x10, 24
+  ; CHECK-NEXT:   SB renamable $x11, $x2, 3 :: (store (s8) into %stack.0 + 3)
+  ; CHECK-NEXT:   renamable $x12 = SRLI renamable $x10, 16
+  ; CHECK-NEXT:   SB renamable $x12, $x2, 2 :: (store (s8) into %stack.0 + 2)
+  ; CHECK-NEXT:   renamable $x13 = SRLI renamable $x10, 8
+  ; CHECK-NEXT:   SB renamable $x13, $x2, 1 :: (store (s8) into %stack.0 + 1)
+  ; CHECK-NEXT:   SB renamable $x10, $x2, 8 :: (store (s8) into %stack.0 + 8)
+  ; CHECK-NEXT:   SB renamable $x11, $x2, 7 :: (store (s8) into %stack.0 + 7)
+  ; CHECK-NEXT:   SB renamable $x12, $x2, 6 :: (store (s8) into %stack.0 + 6)
+  ; CHECK-NEXT:   SB renamable $x13, $x2, 5 :: (store (s8) into %stack.0 + 5)
+  ; CHECK-NEXT:   SB killed renamable $x10, $x2, 12 :: (store (s8) into %stack.0 + 12)
+  ; CHECK-NEXT:   SB renamable $x11, $x2, 11 :: (store (s8) into %stack.0 + 11)
+  ; CHECK-NEXT:   SB renamable $x12, $x2, 10 :: (store (s8) into %stack.0 + 10)
+  ; CHECK-NEXT:   SB renamable $x13, $x2, 9 :: (store (s8) into %stack.0 + 9)
+  ; CHECK-NEXT:   SB killed renamable $x11, $x2, 15 :: (store (s8) into %stack.0 + 15)
+  ; CHECK-NEXT:   SB killed renamable $x12, $x2, 14 :: (store (s8) into %stack.0 + 14)
+  ; CHECK-NEXT:   SB killed renamable $x13, $x2, 13 :: (store (s8) into %stack.0 + 13)
+  ; CHECK-NEXT:   $x2 = frame-destroy ADDI $x2, 32
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3.exit:
+  ; CHECK-NEXT:   PseudoRET
+  bb.0.entry:
+    liveins: $x10
+
+    renamable $x10 = ADDI $x0, -1
+
+  bb.1.if.end.i:
+    liveins: $x10
+
+    BNE $x0, $x0, %bb.3
+    PseudoBR %bb.2
+
+  bb.2.if.then12.i:
+    liveins: $x10
+
+    SB $x0, %stack.0, 31 :: (store (s8) into %stack.0 + 31)
+    SB $x0, %stack.0, 30 :: (store (s8) into %stack.0 + 30)
+    SB $x0, %stack.0, 29 :: (store (s8) into %stack.0 + 29)
+    SB $x0, %stack.0, 28 :: (store (s8) into %stack.0 + 28)
+    SB $x0, %stack.0, 27 :: (store (s8) into %stack.0 + 27)
+    SB $x0, %stack.0, 26 :: (store (s8) into %stack.0 + 26)
+    SB $x0, %stack.0, 25 :: (store (s8) into %stack.0 + 25)
+    SB $x0, %stack.0, 24 :: (store (s8) into %stack.0 + 24)
+    SB $x0, %stack.0, 23 :: (store (s8) into %stack.0 + 23)
+    SB $x0, %stack.0, 22 :: (store (s8) into %stack.0 + 22)
+    SB $x0, %stack.0, 21 :: (store (s8) into %stack.0 + 21)
+    SB $x0, %stack.0, 20 :: (store (s8) into %stack.0 + 20)
+    SB $x0, %stack.0, 19 :: (store (s8) into %stack.0 + 19)
+    SB $x0, %stack.0, 18 :: (store (s8) into %stack.0 + 18)
+    SB $x0, %stack.0, 17 :: (store (s8) into %stack.0 + 17)
+    SB $x0, %stack.0, 16 :: (store (s8) into %stack.0 + 16)
+    SB renamable $x10, %stack.0, 0 :: (store (s8) into %stack.0)
+    SB renamable $x10, %stack.0, 4 :: (store (s8) into %stack.0 + 4)
+    renamable $x11 = SRLI renamable $x10, 24
+    SB renamable $x11, %stack.0, 3 :: (store (s8) into %stack.0 + 3)
+    renamable $x12 = SRLI renamable $x10, 16
+    SB renamable $x12, %stack.0, 2 :: (store (s8) into %stack.0 + 2)
+    renamable $x13 = SRLI renamable $x10, 8
+    SB renamable $x13, %stack.0, 1 :: (store (s8) into %stack.0 + 1)
+    SB renamable $x10, %stack.0, 8 :: (store (s8) into %stack.0 + 8)
+    SB renamable $x11, %stack.0, 7 :: (store (s8) into %stack.0 + 7)
+    SB renamable $x12, %stack.0, 6 :: (store (s8) into %stack.0 + 6)
+    SB renamable $x13, %stack.0, 5 :: (store (s8) into %stack.0 + 5)
+    SB killed renamable $x10, %stack.0, 12 :: (store (s8) into %stack.0 + 12)
+    SB renamable $x11, %stack.0, 11 :: (store (s8) into %stack.0 + 11)
+    SB renamable $x12, %stack.0, 10 :: (store (s8) into %stack.0 + 10)
+    SB renamable $x13, %stack.0, 9 :: (store (s8) into %stack.0 + 9)
+    SB killed renamable $x11, %stack.0, 15 :: (store (s8) into %stack.0 + 15)
+    SB killed renamable $x12, %stack.0, 14 :: (store (s8) into %stack.0 + 14)
+    SB killed renamable $x13, %stack.0, 13 :: (store (s8) into %stack.0 + 13)
+
+  bb.3.exit:
+    PseudoRET
+
+...

From 4a999da2142229f0d2a3598331ce55a1323929bd Mon Sep 17 00:00:00 2001
From: Yuxuan Shui <yshuiv7@gmail.com>
Date: Wed, 30 Aug 2023 15:40:56 +0300
Subject: [PATCH 69/92] [lldb][windows] _wsopen_s does not accept bits other
 than `_S_IREAD | _S_IWRITE`

When sending file from a Linux host to a Windows remote, Linux host will try to copy the source file's permission bits, which will contain `_S_I?GRP` and `_S_I?OTH` bits. Those bits are rejected by `_wsopen_s`, causing it to return EINVAL.

This patch masks out the rejected bits.

GitHub issue: #64313

Reviewed By: jasonmolenda, DavidSpickett

Differential Revision: https://reviews.llvm.org/D156817

(cherry picked from commit 9a4b3fdb82327e808213070fd157be3c557b8b9d)
---
 lldb/source/Host/windows/FileSystem.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lldb/source/Host/windows/FileSystem.cpp b/lldb/source/Host/windows/FileSystem.cpp
index b919d9bcd9dd4..4b0cd74b8013b 100644
--- a/lldb/source/Host/windows/FileSystem.cpp
+++ b/lldb/source/Host/windows/FileSystem.cpp
@@ -101,6 +101,8 @@ int FileSystem::Open(const char *path, int flags, int mode) {
   std::wstring wpath;
   if (!llvm::ConvertUTF8toWide(path, wpath))
     return -1;
+  // All other bits are rejected by _wsopen_s
+  mode = mode & (_S_IREAD | _S_IWRITE);
   int result;
   ::_wsopen_s(&result, wpath.c_str(), flags, _SH_DENYNO, mode);
   return result;

From 078e20cade5518d7df48c75ac25c3179f825f70a Mon Sep 17 00:00:00 2001
From: Mark de Wever <koraq@xs4all.nl>
Date: Sat, 26 Aug 2023 19:25:45 +0200
Subject: [PATCH 70/92] [libc++][format] Fixes out of bounds access.

Fixes https://llvm.org/PR65011

Reviewed By: #libc, ldionne

Differential Revision: https://reviews.llvm.org/D158940

(cherry picked from commit 8930d04d5580c6a2cf04545c87387cd150cd7b46)
---
 libcxx/include/__format/format_functions.h            |  3 +++
 .../utilities/format/format.functions/format_tests.h  |  5 +++++
 .../format/format.functions/vformat.pass.cpp          | 11 +++++++++++
 3 files changed, 19 insertions(+)

diff --git a/libcxx/include/__format/format_functions.h b/libcxx/include/__format/format_functions.h
index 27ec0a295f4f4..bb62c1ce10c15 100644
--- a/libcxx/include/__format/format_functions.h
+++ b/libcxx/include/__format/format_functions.h
@@ -245,6 +245,9 @@ __handle_replacement_field(_Iterator __begin, _Iterator __end,
   using _CharT = iter_value_t<_Iterator>;
   __format::__parse_number_result __r = __format::__parse_arg_id(__begin, __end, __parse_ctx);
 
+  if (__r.__last == __end)
+    std::__throw_format_error("The argument index should end with a ':' or a '}'");
+
   bool __parse = *__r.__last == _CharT(':');
   switch (*__r.__last) {
   case _CharT(':'):
diff --git a/libcxx/test/std/utilities/format/format.functions/format_tests.h b/libcxx/test/std/utilities/format/format.functions/format_tests.h
index 7a9cdaab7e93e..0a5c6649240d6 100644
--- a/libcxx/test/std/utilities/format/format.functions/format_tests.h
+++ b/libcxx/test/std/utilities/format/format.functions/format_tests.h
@@ -3145,8 +3145,13 @@ void format_tests(TestFunction check, ExceptionTest check_exception) {
 
   // *** Test invalid format strings ***
   check_exception("The format string terminates at a '{'", SV("{"));
+  check_exception("The argument index value is too large for the number of arguments supplied", SV("{:"));
   check_exception("The replacement field misses a terminating '}'", SV("{:"), 42);
 
+  check_exception("The argument index should end with a ':' or a '}'", SV("{0"));
+  check_exception("The argument index value is too large for the number of arguments supplied", SV("{0:"));
+  check_exception("The replacement field misses a terminating '}'", SV("{0:"), 42);
+
   check_exception("The format string contains an invalid escape sequence", SV("}"));
   check_exception("The format string contains an invalid escape sequence", SV("{:}-}"), 42);
 
diff --git a/libcxx/test/std/utilities/format/format.functions/vformat.pass.cpp b/libcxx/test/std/utilities/format/format.functions/vformat.pass.cpp
index 6943ddc2f968e..e16d50f18284f 100644
--- a/libcxx/test/std/utilities/format/format.functions/vformat.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/vformat.pass.cpp
@@ -50,6 +50,17 @@ auto test_exception =
     };
 
 int main(int, char**) {
+#if !defined(TEST_HAS_NO_EXCEPTIONS)
+  // reproducer of https://llvm.org/PR65011
+  try {
+    const char fmt[] = {'{', '0'};
+    char buf[4096];
+    [[maybe_unused]] auto ignored =
+        std::vformat_to(buf, std::string_view{fmt, fmt + sizeof(fmt)}, std::make_format_args());
+  } catch (...) {
+  }
+#endif // !defined(TEST_HAS_NO_EXCEPTIONS)
+
   format_tests<char, execution_modus::full>(test, test_exception);
 
 #ifndef TEST_HAS_NO_WIDE_CHARACTERS

From 844f1e5a026c43ffc556679dcec767ece704a40d Mon Sep 17 00:00:00 2001
From: Simon Tatham <simon.tatham@arm.com>
Date: Mon, 21 Aug 2023 08:43:21 +0100
Subject: [PATCH 71/92] [AArch64] Add Defs=[NZCV] to MTE loop pseudos.

The `STGloop` family of pseudo-instructions all expand to a loop which
iterates over a region of memory setting all its MTE tags to a given
value. The loop writes to the flags in order to check termination. But
the unexpanded pseudo-instructions were not marked as modifying the
flags. Therefore it was possible for one to end up in a location where
the flags were live, and then the loop would corrupt them.

We spotted the effect of this in a libc++ test involving a lot of
complicated inlining, and haven't been able to construct a smaller
test case that demonstrates actual incorrect output code. So my test
here is just checking that `implicit-def $nzcv` shows up on the
pseudo-instructions as they're output from isel.

Reviewed By: DavidSpickett

Differential Revision: https://reviews.llvm.org/D158262

(cherry picked from commit b09c575975b691e988a0f2e31d632c5f1038ab1d)
---
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  2 +-
 llvm/test/CodeGen/AArch64/memtag-loop-nzcv.ll | 59 +++++++++++++++++++
 2 files changed, 60 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AArch64/memtag-loop-nzcv.ll

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 0f3d346176780..9e72d37880c58 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -2253,7 +2253,7 @@ def : Pat<(int_aarch64_irg_sp i64:$Rm), (IRGstack SP, i64:$Rm)>;
 
 // Large STG to be expanded into a loop. $sz is the size, $Rn is start address.
 // $Rn_wback is one past the end of the range. $Rm is the loop counter.
-let isCodeGenOnly=1, mayStore=1 in {
+let isCodeGenOnly=1, mayStore=1, Defs=[NZCV] in {
 def STGloop_wback
     : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn_wback), (ins i64imm:$sz, GPR64sp:$Rn),
              [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,@earlyclobber $Rm" >,
diff --git a/llvm/test/CodeGen/AArch64/memtag-loop-nzcv.ll b/llvm/test/CodeGen/AArch64/memtag-loop-nzcv.ll
new file mode 100644
index 0000000000000..86bafd1c93bc1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/memtag-loop-nzcv.ll
@@ -0,0 +1,59 @@
+; RUN: llc -O2 -print-after-isel -mtriple=aarch64-linux-gnu %s -o /dev/null 2>&1 | FileCheck %s --check-prefixes=CHECK
+
+; This test function includes a 256-byte buffer. We expect it to require its
+; MTE tags to be set to a useful value on entry, and cleared again on exit. At
+; the time of writing this test, the pseudo-instructions chosen are
+; STGloop_wback and STGloop respectively, but if different pseudos are selected
+; in future, that's not a problem. The important thing is that both should
+; include that implicit-def of $nzcv, because these pseudo-instructions will
+; expand into loops that use the flags for their termination tests.
+
+; CHECK: STGloop_wback 256, {{.*}}, implicit-def dead $nzcv
+; CHECK: STGloop       256, {{.*}}, implicit-def dead $nzcv
+
+define i32 @foo(i32 noundef %0) #0 {
+  %2 = alloca i32, align 4
+  %3 = alloca [256 x i8], align 1
+  %4 = alloca i64, align 8
+  %5 = alloca i32, align 4
+  %6 = alloca i64, align 8
+  store i32 %0, ptr %2, align 4
+  %7 = load i32, ptr %2, align 4
+  %8 = getelementptr inbounds [256 x i8], ptr %3, i64 0, i64 0
+  %9 = call i64 @read(i32 noundef %7, ptr noundef %8, i64 noundef 256)
+  store i64 %9, ptr %4, align 8
+  store i32 0, ptr %5, align 4
+  store i64 0, ptr %6, align 8
+  br label %10
+
+10:                                               ; preds = %21, %1
+  %11 = load i64, ptr %6, align 8
+  %12 = load i64, ptr %4, align 8
+  %13 = icmp ult i64 %11, %12
+  br i1 %13, label %14, label %24
+
+14:                                               ; preds = %10
+  %15 = load i64, ptr %6, align 8
+  %16 = getelementptr inbounds [256 x i8], ptr %3, i64 0, i64 %15
+  %17 = load i8, ptr %16, align 1
+  %18 = zext i8 %17 to i32
+  %19 = load i32, ptr %5, align 4
+  %20 = add nsw i32 %19, %18
+  store i32 %20, ptr %5, align 4
+  br label %21
+
+21:                                               ; preds = %14
+  %22 = load i64, ptr %6, align 8
+  %23 = add i64 %22, 1
+  store i64 %23, ptr %6, align 8
+  br label %10
+
+24:                                               ; preds = %10
+  %25 = load i32, ptr %5, align 4
+  %26 = srem i32 %25, 251
+  ret i32 %26
+}
+
+declare i64 @read(i32 noundef, ptr noundef, i64 noundef)
+
+attributes #0 = { sanitize_memtag "target-features"="+mte" }

From 2b3336fa54af70d885b8d795d69337d5b05913e5 Mon Sep 17 00:00:00 2001
From: Amy Kwan <amy.kwan1@ibm.com>
Date: Wed, 30 Aug 2023 11:54:12 -0500
Subject: [PATCH 72/92] [PowerPC][lld] Account for additional X-Forms ->
 D-Form/DS-Forms load/stores when relaxing initial-exec to local-exec

D153645 added additional X-Form load/stores that can be generated for TLS accesses.
However, these added instructions have not been accounted for in lld. As a result,
lld does not know how to handle them and cannot relax initial-exec to local-exec
when the initial-exec sequence contains these additional load/stores.

This patch aims to resolve https://github.com/llvm/llvm-project/issues/64424.

Differential Revision: https://reviews.llvm.org/D158197

(cherry picked from commit 698b45aa902de4d30c798e8d6bd080c8e31bade8)
---
 lld/ELF/Arch/PPC.cpp              |  12 ++-
 lld/ELF/Arch/PPC64.cpp            |  86 ++++++++++++++-----
 lld/ELF/Target.h                  |   1 +
 lld/test/ELF/ppc32-tls-ie.s       |  27 +++++-
 lld/test/ELF/ppc64-tls-ie.s       |  72 ++++++++++++++--
 lld/test/ELF/ppc64-tls-pcrel-ie.s | 132 +++++++++++++++++++++++++++---
 6 files changed, 287 insertions(+), 43 deletions(-)

diff --git a/lld/ELF/Arch/PPC.cpp b/lld/ELF/Arch/PPC.cpp
index 87942c1e92452..3d21edb3453a1 100644
--- a/lld/ELF/Arch/PPC.cpp
+++ b/lld/ELF/Arch/PPC.cpp
@@ -471,10 +471,14 @@ void PPC::relaxTlsIeToLe(uint8_t *loc, const Relocation &rel,
     if (insn >> 26 != 31)
       error("unrecognized instruction for IE to LE R_PPC_TLS");
     // addi rT, rT, x@tls --> addi rT, rT, x@tprel@l
-    uint32_t dFormOp = getPPCDFormOp((read32(loc) & 0x000007fe) >> 1);
-    if (dFormOp == 0)
-      error("unrecognized instruction for IE to LE R_PPC_TLS");
-    write32(loc, (dFormOp << 26) | (insn & 0x03ff0000) | lo(val));
+    unsigned secondaryOp = (read32(loc) & 0x000007fe) >> 1;
+    uint32_t dFormOp = getPPCDFormOp(secondaryOp);
+    if (dFormOp == 0) { // Expecting a DS-Form instruction.
+      dFormOp = getPPCDSFormOp(secondaryOp);
+      if (dFormOp == 0)
+        error("unrecognized instruction for IE to LE R_PPC_TLS");
+    }
+    write32(loc, (dFormOp | (insn & 0x03ff0000) | lo(val)));
     break;
   }
   default:
diff --git a/lld/ELF/Arch/PPC64.cpp b/lld/ELF/Arch/PPC64.cpp
index 36b1d0e3c9be4..0b6459f852c0b 100644
--- a/lld/ELF/Arch/PPC64.cpp
+++ b/lld/ELF/Arch/PPC64.cpp
@@ -37,6 +37,12 @@ enum XFormOpcd {
   STHX = 407,
   STWX = 151,
   STDX = 149,
+  LHAX = 343,
+  LWAX = 341,
+  LFSX = 535,
+  LFDX = 599,
+  STFSX = 663,
+  STFDX = 727,
   ADD = 266,
 };
 
@@ -49,7 +55,6 @@ enum DFormOpcd {
   LWZ = 32,
   LWZU = 33,
   LFSU = 49,
-  LD = 58,
   LFDU = 51,
   STB = 38,
   STBU = 39,
@@ -59,10 +64,20 @@ enum DFormOpcd {
   STWU = 37,
   STFSU = 53,
   STFDU = 55,
-  STD = 62,
+  LHA = 42,
+  LFS = 48,
+  LFD = 50,
+  STFS = 52,
+  STFD = 54,
   ADDI = 14
 };
 
+enum DSFormOpcd {
+  LD = 58,
+  LWA = 58,
+  STD = 62
+};
+
 constexpr uint32_t NOP = 0x60000000;
 
 enum class PPCLegacyInsn : uint32_t {
@@ -825,26 +840,48 @@ void PPC64::relaxTlsLdToLe(uint8_t *loc, const Relocation &rel,
   }
 }
 
+// Map X-Form instructions to their DS-Form counterparts, if applicable.
+// The full encoding is returned here to distinguish between the different
+// DS-Form instructions.
+unsigned elf::getPPCDSFormOp(unsigned secondaryOp) {
+  switch (secondaryOp) {
+  case LWAX:
+    return (LWA << 26) | 0x2;
+  case LDX:
+    return LD << 26;
+  case STDX:
+    return STD << 26;
+  default:
+    return 0;
+  }
+}
+
 unsigned elf::getPPCDFormOp(unsigned secondaryOp) {
   switch (secondaryOp) {
   case LBZX:
-    return LBZ;
+    return LBZ << 26;
   case LHZX:
-    return LHZ;
+    return LHZ << 26;
   case LWZX:
-    return LWZ;
-  case LDX:
-    return LD;
+    return LWZ << 26;
   case STBX:
-    return STB;
+    return STB << 26;
   case STHX:
-    return STH;
+    return STH << 26;
   case STWX:
-    return STW;
-  case STDX:
-    return STD;
+    return STW << 26;
+  case LHAX:
+    return LHA << 26;
+  case LFSX:
+    return LFS << 26;
+  case LFDX:
+    return LFD << 26;
+  case STFSX:
+    return STFS << 26;
+  case STFDX:
+    return STFD << 26;
   case ADD:
-    return ADDI;
+    return ADDI << 26;
   default:
     return 0;
   }
@@ -898,10 +935,16 @@ void PPC64::relaxTlsIeToLe(uint8_t *loc, const Relocation &rel,
         error("unrecognized instruction for IE to LE R_PPC64_TLS");
       uint32_t secondaryOp = (read32(loc) & 0x000007FE) >> 1; // bits 21-30
       uint32_t dFormOp = getPPCDFormOp(secondaryOp);
-      if (dFormOp == 0)
-        error("unrecognized instruction for IE to LE R_PPC64_TLS");
-      write32(loc, ((dFormOp << 26) | (read32(loc) & 0x03FFFFFF)));
-      relocateNoSym(loc + offset, R_PPC64_TPREL16_LO, val);
+      uint32_t finalReloc;
+      if (dFormOp == 0) { // Expecting a DS-Form instruction.
+        dFormOp = getPPCDSFormOp(secondaryOp);
+        if (dFormOp == 0)
+          error("unrecognized instruction for IE to LE R_PPC64_TLS");
+        finalReloc = R_PPC64_TPREL16_LO_DS;
+      } else
+        finalReloc = R_PPC64_TPREL16_LO;
+      write32(loc, dFormOp | (read32(loc) & 0x03ff0000));
+      relocateNoSym(loc + offset, finalReloc, val);
     } else if (locAsInt % 4 == 1) {
       // If the offset is not 4 byte aligned then we have a PCRel type reloc.
       // This version of the relocation is offset by one byte from the
@@ -926,9 +969,12 @@ void PPC64::relaxTlsIeToLe(uint8_t *loc, const Relocation &rel,
         }
       } else {
         uint32_t dFormOp = getPPCDFormOp(secondaryOp);
-        if (dFormOp == 0)
-          errorOrWarn("unrecognized instruction for IE to LE R_PPC64_TLS");
-        write32(loc - 1, ((dFormOp << 26) | (tlsInstr & 0x03FF0000)));
+        if (dFormOp == 0) { // Expecting a DS-Form instruction.
+          dFormOp = getPPCDSFormOp(secondaryOp);
+          if (dFormOp == 0)
+            errorOrWarn("unrecognized instruction for IE to LE R_PPC64_TLS");
+        }
+        write32(loc - 1, (dFormOp | (tlsInstr & 0x03ff0000)));
       }
     } else {
       errorOrWarn("R_PPC64_TLS must be either 4 byte aligned or one byte "
diff --git a/lld/ELF/Target.h b/lld/ELF/Target.h
index 9d4f22dd93f1b..47dbe6b4d1c65 100644
--- a/lld/ELF/Target.h
+++ b/lld/ELF/Target.h
@@ -207,6 +207,7 @@ void processArmCmseSymbols();
 void writePPC32GlinkSection(uint8_t *buf, size_t numEntries);
 
 unsigned getPPCDFormOp(unsigned secondaryOp);
+unsigned getPPCDSFormOp(unsigned secondaryOp);
 
 // In the PowerPC64 Elf V2 abi a function can have 2 entry points.  The first
 // is a global entry point (GEP) which typically is used to initialize the TOC
diff --git a/lld/test/ELF/ppc32-tls-ie.s b/lld/test/ELF/ppc32-tls-ie.s
index f9f46452484a6..84a105c8626b3 100644
--- a/lld/test/ELF/ppc32-tls-ie.s
+++ b/lld/test/ELF/ppc32-tls-ie.s
@@ -12,8 +12,8 @@
 # IE-REL:      FLAGS STATIC_TLS
 ## A non-preemptable symbol (b) has 0 st_shndx.
 # IE-REL:      .rela.dyn {
-# IE-REL-NEXT:   0x20238 R_PPC_TPREL32 - 0xC
-# IE-REL-NEXT:   0x20234 R_PPC_TPREL32 a 0x0
+# IE-REL-NEXT:   0x20258 R_PPC_TPREL32 - 0xC
+# IE-REL-NEXT:   0x20254 R_PPC_TPREL32 a 0x0
 # IE-REL-NEXT: }
 
 ## &.got[3] - _GLOBAL_OFFSET_TABLE_ = 12
@@ -44,6 +44,12 @@ lbzx 10, 8, c@tls
 # IE-NEXT: stbx 14, 4, 2
 # IE-NEXT: sthx 15, 5, 2
 # IE-NEXT: stwx 16, 6, 2
+# IE-NEXT: lhax 17, 7, 2
+# IE-NEXT: lwax 18, 8, 2
+# IE-NEXT: lfsx 19, 9, 2
+# IE-NEXT: lfdx 20, 10, 2
+# IE-NEXT: stfsx 21, 11, 2
+# IE-NEXT: stfdx 22, 12, 2
 
 ## In LE, these X-Form instructions are changed to their corresponding D-Form.
 # LE-NEXT: lhz 12, -28660(2)
@@ -51,12 +57,26 @@ lbzx 10, 8, c@tls
 # LE-NEXT: stb 14, -28660(4)
 # LE-NEXT: sth 15, -28660(5)
 # LE-NEXT: stw 16, -28660(6)
+# LE-NEXT: lha 17, -28660(7)
+# LE-NEXT: lwa 18, -28660(8)
+# LE-NEXT: lfs 19, -28660(9)
+# LE-NEXT: lfd 20, -28660(10)
+# LE-NEXT: stfs 21, -28660(11)
+# LE-NEXT: stfd 22, -28660(12)
 
 lhzx 12, 2, s@tls
 lwzx 13, 3, i@tls
 stbx 14, 4, c@tls
 sthx 15, 5, s@tls
 stwx 16, 6, i@tls
+lhax 17, 7, s@tls
+lwax 18, 8, i@tls
+lfsx 19, 9, f@tls
+lfdx 20, 10, d@tls
+stfsx 21, 11, f@tls
+stfdx 22, 12, d@tls
+ldx 23, 13, l@tls
+stdx 24, 14, l@tls
 
 .section .tbss
 .globl a
@@ -66,3 +86,6 @@ a:
 c:
 s:
 i:
+f:
+d:
+l:
diff --git a/lld/test/ELF/ppc64-tls-ie.s b/lld/test/ELF/ppc64-tls-ie.s
index 8da808b86c30b..8855e8c012399 100644
--- a/lld/test/ELF/ppc64-tls-ie.s
+++ b/lld/test/ELF/ppc64-tls-ie.s
@@ -24,10 +24,12 @@
 
 # IE-REL:      FLAGS STATIC_TLS
 # IE-REL:      .rela.dyn {
-# IE-REL-NEXT:   0x204C8 R_PPC64_TPREL64 c 0x0
-# IE-REL-NEXT:   0x204D0 R_PPC64_TPREL64 s 0x0
-# IE-REL-NEXT:   0x204D8 R_PPC64_TPREL64 i 0x0
-# IE-REL-NEXT:   0x204E0 R_PPC64_TPREL64 l 0x0
+# IE-REL-NEXT:   0x205A8 R_PPC64_TPREL64 c 0x0
+# IE-REL-NEXT:   0x205B0 R_PPC64_TPREL64 s 0x0
+# IE-REL-NEXT:   0x205B8 R_PPC64_TPREL64 i 0x0
+# IE-REL-NEXT:   0x205C0 R_PPC64_TPREL64 l 0x0
+# IE-REL-NEXT:   0x205C8 R_PPC64_TPREL64 f 0x0
+# IE-REL-NEXT:   0x205D0 R_PPC64_TPREL64 d 0x0
 # IE-REL-NEXT: }
 
 # INPUT-REL: R_PPC64_GOT_TPREL16_HA c 0x0
@@ -152,10 +154,64 @@ test_ds:
   ld 4, l@got@tprel(2)
   stdx 3, 4, l@tls
 
+# LE-LABEL: <test_lhax>:
+# LE-NEXT:    nop
+# LE-NEXT:    addis 3, 13, 0
+# LE-NEXT:    lha 3, -28670(3)
+test_lhax:
+  addis 3, 2, s@got@tprel@ha
+  ld 3, s@got@tprel@l(3)
+  lhax 3, 3, s@tls
+
+# LE-LABEL: <test_lwax>:
+# LE-NEXT:    nop
+# LE-NEXT:    addis 3, 13, 0
+# LE-NEXT:    lwa 3, -28668(3)
+test_lwax:
+  addis 3, 2, i@got@tprel@ha
+  ld 3, i@got@tprel@l(3)
+  lwax 3, 3, i@tls
+
+# LE-LABEL: <test_lfsx>:
+# LE-NEXT:    nop
+# LE-NEXT:    addis 3, 13, 0
+# LE-NEXT:    lfs 3, -28656(3)
+test_lfsx:
+  addis 3, 2, f@got@tprel@ha
+  ld 3, f@got@tprel@l(3)
+  lfsx 3, 3, f@tls
+
+# LE-LABEL: <test_lfdx>:
+# LE-NEXT:    nop
+# LE-NEXT:    addis 3, 13, 0
+# LE-NEXT:    lfd 3, -28648(3)
+test_lfdx:
+  addis 3, 2, d@got@tprel@ha
+  ld 3, d@got@tprel@l(3)
+  lfdx 3, 3, d@tls
+
+# LE-LABEL: <test_stfsx>:
+# LE-NEXT:    nop
+# LE-NEXT:    addis 4, 13, 0
+# LE-NEXT:    stfs 3, -28656(4)
+test_stfsx:
+  addis 4, 2, f@got@tprel@ha
+  ld 4, f@got@tprel@l(4)
+  stfsx 3, 4, f@tls
+
+# LE-LABEL: <test_stfdx>:
+# LE-NEXT:    nop
+# LE-NEXT:    addis 4, 13, 0
+# LE-NEXT:    stfd 3, -28648(4)
+test_stfdx:
+  addis 4, 2, d@got@tprel@ha
+  ld 4, d@got@tprel@l(4)
+  stfdx 3, 4, d@tls
+
 # NOREL: There are no relocations in this file.
 
 .section .tdata,"awT",@progbits
-.globl c, s, i, l
+.globl c, s, i, l, f, d
 c:
 .byte 97
 
@@ -170,3 +226,9 @@ i:
 .p2align 3
 l:
 .quad 55
+f:
+.long 55
+
+.p2align 3
+d:
+.quad 55
diff --git a/lld/test/ELF/ppc64-tls-pcrel-ie.s b/lld/test/ELF/ppc64-tls-pcrel-ie.s
index f7a828dc41744..38c081f966469 100644
--- a/lld/test/ELF/ppc64-tls-pcrel-ie.s
+++ b/lld/test/ELF/ppc64-tls-pcrel-ie.s
@@ -29,6 +29,12 @@ SECTIONS {
   .text_val 0x1002000 : { *(.text_val) }
   .text_twoval 0x1003000 : { *(.text_twoval) }
   .text_incrval 0x1004000 : { *(.text_incrval) }
+  .text_incrval_half 0x1005000 : { *(.text_incrval_half) }
+  .text_incrval_word 0x1006000 : { *(.text_incrval_word) }
+  .text_incrval_float 0x1007000 : { *(.text_incrval_float) }
+  .text_incrval_double 0x1008000 : { *(.text_incrval_double) }
+  .text_incrval_dword 0x1009000 : { *(.text_incrval_dword) }
+  .text_incrval_half_zero 0x1010000 : { *(.text_incrval_half_zero) }
 }
 
 #--- defs
@@ -42,26 +48,26 @@ y:
 
 #--- asm
 # IE-RELOC: Relocation section '.rela.dyn' at offset 0x10090 contains 2 entries:
-# IE-RELOC: 00000000010040f0  0000000100000049 R_PPC64_TPREL64        0000000000000000 x + 0
-# IE-RELOC: 00000000010040f8  0000000200000049 R_PPC64_TPREL64        0000000000000000 y + 0
+# IE-RELOC: 00000000010100f0  0000000100000049 R_PPC64_TPREL64        0000000000000000 x + 0
+# IE-RELOC-NEXT: 00000000010100f8  0000000200000049 R_PPC64_TPREL64        0000000000000000 y + 0
 
 # IE-SYM:   Symbol table '.dynsym' contains 3 entries:
 # IE-SYM:   1: 0000000000000000     0 TLS     GLOBAL DEFAULT   UND x
 # IE-SYM:   2: 0000000000000000     0 TLS     GLOBAL DEFAULT   UND y
 
 # IE-GOT:      Hex dump of section '.got':
-# IE-GOT-NEXT: 0x010040e8 e8c00001 00000000 00000000 00000000
+# IE-GOT-NEXT: 0x010100e8 e8800101 00000000 00000000 00000000
 
 # LE-RELOC: There are no relocations in this file.
 
-# LE-SYM: Symbol table '.symtab' contains 8 entries:
-# LE-SYM: 6: 0000000000000000     0 TLS     GLOBAL DEFAULT     6 x
-# LE-SYM: 7: 0000000000000004     0 TLS     GLOBAL DEFAULT     6 y
+# LE-SYM: Symbol table '.symtab' contains 14 entries:
+# LE-SYM: 0000000000000000     0 TLS     GLOBAL DEFAULT     [[#]] x
+# LE-SYM: 0000000000000004     0 TLS     GLOBAL DEFAULT     [[#]] y
 
 # LE-GOT: could not find section '.got'
 
 # IE-LABEL: <IEAddr>:
-# IE-NEXT:    pld 3, 12528(0), 1
+# IE-NEXT:    pld 3, 61680(0), 1
 # IE-NEXT:    add 3, 3, 13
 # IE-NEXT:    blr
 # LE-LABEL: <IEAddr>:
@@ -75,7 +81,7 @@ IEAddr:
 	blr
 
 # IE-LABEL: <IEAddrCopy>:
-# IE-NEXT:    pld 3, 12512(0), 1
+# IE-NEXT:    pld 3, 61664(0), 1
 # IE-NEXT:    add 4, 3, 13
 # IE-NEXT:    blr
 # LE-LABEL: <IEAddrCopy>:
@@ -89,7 +95,7 @@ IEAddrCopy:
 	blr
 
 # IE-LABEL: <IEVal>:
-# IE-NEXT:    pld 3, 8432(0), 1
+# IE-NEXT:    pld 3, 57584(0), 1
 # IE-NEXT:    lwzx 3, 3, 13
 # IE-NEXT:    blr
 # LE-LABEL: <IEVal>:
@@ -103,8 +109,8 @@ IEVal:
 	blr
 
 # IE-LABEL: <IETwoVal>:
-# IE-NEXT:    pld 3, 4336(0), 1
-# IE-NEXT:    pld 4, 4336(0), 1
+# IE-NEXT:    pld 3, 53488(0), 1
+# IE-NEXT:    pld 4, 53488(0), 1
 # IE-NEXT:    lwzx 3, 3, 13
 # IE-NEXT:    lwzx 4, 4, 13
 # IE-NEXT:    blr
@@ -123,7 +129,7 @@ IETwoVal:
 	blr
 
 # IE-LABEL: <IEIncrementVal>:
-# IE-NEXT:    pld 4, 248(0), 1
+# IE-NEXT:    pld 4, 49400(0), 1
 # IE-NEXT:    lwzx 3, 4, 13
 # IE-NEXT:    stwx 3, 4, 13
 # IE-NEXT:    blr
@@ -138,3 +144,105 @@ IEIncrementVal:
 	lwzx 3, 4, y@tls@pcrel
 	stwx 3, 4, y@tls@pcrel
 	blr
+
+# IE-LABEL: <IEIncrementValHalf>:
+# IE-NEXT:    pld 4, 45304(0), 1
+# IE-NEXT:    lhax 3, 4, 13
+# IE-NEXT:    sthx 3, 4, 13
+# IE-NEXT:    blr
+# LE-LABEL: <IEIncrementValHalf>:
+# LE-NEXT:    paddi 4, 13, -28668, 0
+# LE-NEXT:    lha 3, 0(4)
+# LE-NEXT:    sth 3, 0(4)
+# LE-NEXT:    blr
+.section .text_incrval_half, "ax", %progbits
+IEIncrementValHalf:
+	pld 4, y@got@tprel@pcrel(0), 1
+	lhax 3, 4, y@tls@pcrel
+	sthx 3, 4, y@tls@pcrel
+	blr
+
+# IE-LABEL: <IEIncrementValWord>:
+# IE-NEXT:    pld 4, 41208(0), 1
+# IE-NEXT:    lwax 3, 4, 13
+# IE-NEXT:    stwx 3, 4, 13
+# IE-NEXT:    blr
+# LE-LABEL: <IEIncrementValWord>:
+# LE-NEXT:    paddi 4, 13, -28668, 0
+# LE-NEXT:    lwa 3, 0(4)
+# LE-NEXT:    stw 3, 0(4)
+# LE-NEXT:    blr
+.section .text_incrval_word, "ax", %progbits
+IEIncrementValWord:
+	pld 4, y@got@tprel@pcrel(0), 1
+	lwax 3, 4, y@tls@pcrel
+	stwx 3, 4, y@tls@pcrel
+	blr
+
+# IE-LABEL: <IEIncrementValFloat>:
+# IE-NEXT:    pld 4, 37112(0), 1
+# IE-NEXT:    lfsx 3, 4, 13
+# IE-NEXT:    stfsx 3, 4, 13
+# IE-NEXT:    blr
+# LE-LABEL: <IEIncrementValFloat>:
+# LE-NEXT:    paddi 4, 13, -28668, 0
+# LE-NEXT:    lfs 3, 0(4)
+# LE-NEXT:    stfs 3, 0(4)
+# LE-NEXT:    blr
+.section .text_incrval_float, "ax", %progbits
+IEIncrementValFloat:
+	pld 4, y@got@tprel@pcrel(0), 1
+	lfsx 3, 4, y@tls@pcrel
+	stfsx 3, 4, y@tls@pcrel
+	blr
+
+# IE-LABEL: <IEIncrementValDouble>:
+# IE-NEXT:    pld 4, 33016(0), 1
+# IE-NEXT:    lfdx 3, 4, 13
+# IE-NEXT:    stfdx 3, 4, 13
+# IE-NEXT:    blr
+# LE-LABEL: <IEIncrementValDouble>:
+# LE-NEXT:    paddi 4, 13, -28668, 0
+# LE-NEXT:    lfd 3, 0(4)
+# LE-NEXT:    stfd 3, 0(4)
+# LE-NEXT:    blr
+.section .text_incrval_double, "ax", %progbits
+IEIncrementValDouble:
+	pld 4, y@got@tprel@pcrel(0), 1
+	lfdx 3, 4, y@tls@pcrel
+	stfdx 3, 4, y@tls@pcrel
+	blr
+
+# IE-LABEL: <IEIncrementValDword>:
+# IE-NEXT:    pld 4, 28920(0), 1
+# IE-NEXT:    ldx 3, 4, 13
+# IE-NEXT:    stdx 3, 4, 13
+# IE-NEXT:    blr
+# LE-LABEL: <IEIncrementValDword>:
+# LE-NEXT:    paddi 4, 13, -28668, 0
+# LE-NEXT:    ld 3, 0(4)
+# LE-NEXT:    std 3, 0(4)
+# LE-NEXT:    blr
+.section .text_incrval_dword, "ax", %progbits
+IEIncrementValDword:
+	pld 4, y@got@tprel@pcrel(0), 1
+	ldx 3, 4, y@tls@pcrel
+	stdx 3, 4, y@tls@pcrel
+	blr
+
+# IE-LABEL: <IEIncrementValHalfZero>:
+# IE-NEXT:    pld 4, 248(0), 1
+# IE-NEXT:    lhzx 3, 4, 13
+# IE-NEXT:    sthx 3, 4, 13
+# IE-NEXT:    blr
+# LE-LABEL: <IEIncrementValHalfZero>:
+# LE-NEXT:    paddi 4, 13, -28668, 0
+# LE-NEXT:    lhz 3, 0(4)
+# LE-NEXT:    sth 3, 0(4)
+# LE-NEXT:    blr
+.section .text_incrval_half_zero, "ax", %progbits
+IEIncrementValHalfZero:
+	pld 4, y@got@tprel@pcrel(0), 1
+	lhzx 3, 4, y@tls@pcrel
+	sthx 3, 4, y@tls@pcrel
+	blr

From b879e0f5c402dd8f0e18e8c5ea64774a407390c6 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Mon, 28 Aug 2023 10:11:12 -0700
Subject: [PATCH 73/92] [RISCV] Prevent tryToFoldBNEOnCmpXchgResult from
 deleting AND if it has other users.

This disables the transform if the branch does not have the kill
flag set for the AND we want to delete.

Ideally we'd be able to share the AND with the AND we create in
the expansion, but that's a more complex transform. So this starts
with the simple approach to fix the miscompile.

This should be backported to LLVM 17.

Fixes PR65025.

Reviewed By: asb

Differential Revision: https://reviews.llvm.org/D158962

(cherry picked from commit ff6d33382faf3709fa270ae0abb8d165142df9ae)
---
 .../RISCV/RISCVExpandAtomicPseudoInsts.cpp    |  9 ++++
 llvm/test/CodeGen/RISCV/pr65025.ll            | 48 +++++++++++++++++++
 2 files changed, 57 insertions(+)
 create mode 100644 llvm/test/CodeGen/RISCV/pr65025.ll

diff --git a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp
index 59f1e8319ae72..d10bba26023ff 100644
--- a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp
@@ -572,6 +572,15 @@ bool tryToFoldBNEOnCmpXchgResult(MachineBasicBlock &MBB,
   if (!(BNEOp0 == DestReg && BNEOp1 == CmpValReg) &&
       !(BNEOp0 == CmpValReg && BNEOp1 == DestReg))
     return false;
+
+  // Make sure the branch is the only user of the AND.
+  if (MaskReg.isValid()) {
+    if (BNEOp0 == DestReg && !MBBI->getOperand(0).isKill())
+      return false;
+    if (BNEOp1 == DestReg && !MBBI->getOperand(1).isKill())
+      return false;
+  }
+
   ToErase.push_back(&*MBBI);
   LoopHeadBNETarget = MBBI->getOperand(2).getMBB();
   MBBI = skipDebugInstructionsForward(std::next(MBBI), E);
diff --git a/llvm/test/CodeGen/RISCV/pr65025.ll b/llvm/test/CodeGen/RISCV/pr65025.ll
new file mode 100644
index 0000000000000..dcd71edc460b8
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/pr65025.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -mtriple=riscv64 -mattr=+a | FileCheck %s
+
+define ptr @cmpxchg_masked_and_branch1(ptr %ptr, i8 signext %cmp, i8 signext %val) nounwind {
+; CHECK-LABEL: cmpxchg_masked_and_branch1:
+; CHECK:       # %bb.0: # %do_cmpxchg
+; CHECK-NEXT:    andi a3, a0, -4
+; CHECK-NEXT:    slli a4, a0, 3
+; CHECK-NEXT:    li a5, 255
+; CHECK-NEXT:    sllw a5, a5, a4
+; CHECK-NEXT:    andi a1, a1, 255
+; CHECK-NEXT:    sllw a1, a1, a4
+; CHECK-NEXT:    andi a2, a2, 255
+; CHECK-NEXT:    sllw a2, a2, a4
+; CHECK-NEXT:  .LBB0_3: # %do_cmpxchg
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    lr.w.aqrl a4, (a3)
+; CHECK-NEXT:    and a6, a4, a5
+; CHECK-NEXT:    bne a6, a1, .LBB0_5
+; CHECK-NEXT:  # %bb.4: # %do_cmpxchg
+; CHECK-NEXT:    # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    xor a6, a4, a2
+; CHECK-NEXT:    and a6, a6, a5
+; CHECK-NEXT:    xor a6, a4, a6
+; CHECK-NEXT:    sc.w.rl a6, a6, (a3)
+; CHECK-NEXT:    bnez a6, .LBB0_3
+; CHECK-NEXT:  .LBB0_5: # %do_cmpxchg
+; CHECK-NEXT:    and a2, a4, a5
+; CHECK-NEXT:    bne a1, a2, .LBB0_2
+; CHECK-NEXT:  # %bb.1: # %returnptr
+; CHECK-NEXT:    xor a1, a1, a2
+; CHECK-NEXT:    snez a1, a1
+; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB0_2: # %exit
+; CHECK-NEXT:    li a0, 0
+; CHECK-NEXT:    ret
+do_cmpxchg:
+  %0 = cmpxchg ptr %ptr, i8 %cmp, i8 %val seq_cst seq_cst
+  %1 = extractvalue { i8, i1 } %0, 1
+  %2 = select i1 %1, ptr %ptr, ptr null
+  br i1 %1, label %returnptr, label %exit
+returnptr:
+  ret ptr %2
+exit:
+  ret ptr null
+}

From dc2d2f9ba1bc3e72d011c6203eb986e8d0f5ca32 Mon Sep 17 00:00:00 2001
From: Haojian Wu <hokein.wu@gmail.com>
Date: Mon, 31 Jul 2023 09:20:55 +0200
Subject: [PATCH 74/92] [clangd] Respect IWYU keep pragma for standard headers.

see the issue https://github.com/llvm/llvm-project/issues/64191

Differential Revision: https://reviews.llvm.org/D156650

(cherry picked from commit dcb28244faa88cb566a852533790bcac75daaa0f)
---
 clang-tools-extra/clangd/IncludeCleaner.cpp                | 4 ++--
 clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp | 5 +++++
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/clang-tools-extra/clangd/IncludeCleaner.cpp b/clang-tools-extra/clangd/IncludeCleaner.cpp
index 9708c67ca2883..b2c04ac4d5463 100644
--- a/clang-tools-extra/clangd/IncludeCleaner.cpp
+++ b/clang-tools-extra/clangd/IncludeCleaner.cpp
@@ -70,6 +70,8 @@ bool isIgnored(llvm::StringRef HeaderPath, HeaderFilter IgnoreHeaders) {
 bool mayConsiderUnused(
     const Inclusion &Inc, ParsedAST &AST,
     const include_cleaner::PragmaIncludes *PI) {
+  if (PI && PI->shouldKeep(Inc.HashLine + 1))
+      return false;
   // FIXME(kirillbobyrev): We currently do not support the umbrella headers.
   // System headers are likely to be standard library headers.
   // Until we have good support for umbrella headers, don't warn about them.
@@ -81,8 +83,6 @@ bool mayConsiderUnused(
       AST.getIncludeStructure().getRealPath(HID));
   assert(FE);
   if (PI) {
-    if (PI->shouldKeep(Inc.HashLine + 1))
-      return false;
     // Check if main file is the public interface for a private header. If so we
     // shouldn't diagnose it as unused.
     if (auto PHeader = PI->getPublic(*FE); !PHeader.empty()) {
diff --git a/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp b/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp
index c55351fb1f91d..83a7c45df1695 100644
--- a/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp
+++ b/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp
@@ -76,6 +76,8 @@ TEST(IncludeCleaner, StdlibUnused) {
   auto TU = TestTU::withCode(R"cpp(
     #include <list>
     #include <queue>
+    #include <vector> // IWYU pragma: keep
+    #include <string> // IWYU pragma: export
     std::list<int> x;
   )cpp");
   // Layout of std library impl is not relevant.
@@ -84,10 +86,13 @@ TEST(IncludeCleaner, StdlibUnused) {
     namespace std {
       template <typename> class list {};
       template <typename> class queue {};
+      template <typename> class vector {};
     }
   )cpp";
   TU.AdditionalFiles["list"] = "#include <bits>";
   TU.AdditionalFiles["queue"] = "#include <bits>";
+  TU.AdditionalFiles["vector"] = "#include <bits>";
+  TU.AdditionalFiles["string"] = "#include <bits>";
   TU.ExtraArgs = {"-isystem", testRoot()};
   auto AST = TU.build();
   IncludeCleanerFindings Findings = computeIncludeCleanerFindings(AST);

From a0de0b440fa606805749dc7469a1b7dcae851eba Mon Sep 17 00:00:00 2001
From: Phoebe Wang <phoebe.wang@intel.com>
Date: Tue, 29 Aug 2023 08:59:24 +0800
Subject: [PATCH 75/92] [X86][BF16] Add test coverage for AVX-NE-CONVERT

Split from D158952.

(cherry picked from commit 30ec9473c6685d64d5caa17e2a6e8f4ccf275159)
---
 llvm/lib/Target/X86/X86ISelLowering.cpp |   3 +-
 llvm/test/CodeGen/X86/bfloat.ll         | 768 ++++++++++++++++++++----
 2 files changed, 664 insertions(+), 107 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ae9012055bbb9..7bcc181e2ca91 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -11360,7 +11360,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
     return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
 
-  if (VT.getVectorElementType() == MVT::bf16 && Subtarget.hasBF16())
+  if (VT.getVectorElementType() == MVT::bf16 &&
+      (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
     return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
 
   if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
index dff4864537bfd..6798adaf1e5f2 100644
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512vl | FileCheck %s --check-prefixes=CHECK,F16,BF16
-; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,F16,FP16
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,F16,BF16
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,F16,FP16
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avxneconvert | FileCheck %s --check-prefixes=CHECK,AVX,AVXNC
 
 define void @add(ptr %pa, ptr %pb, ptr %pc) nounwind {
 ; SSE2-LABEL: add:
@@ -21,22 +22,22 @@ define void @add(ptr %pa, ptr %pb, ptr %pc) nounwind {
 ; SSE2-NEXT:    popq %rbx
 ; SSE2-NEXT:    retq
 ;
-; F16-LABEL: add:
-; F16:       # %bb.0:
-; F16-NEXT:    pushq %rbx
-; F16-NEXT:    movq %rdx, %rbx
-; F16-NEXT:    movzwl (%rsi), %eax
-; F16-NEXT:    shll $16, %eax
-; F16-NEXT:    vmovd %eax, %xmm0
-; F16-NEXT:    movzwl (%rdi), %eax
-; F16-NEXT:    shll $16, %eax
-; F16-NEXT:    vmovd %eax, %xmm1
-; F16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; F16-NEXT:    callq __truncsfbf2@PLT
-; F16-NEXT:    vmovd %xmm0, %eax
-; F16-NEXT:    movw %ax, (%rbx)
-; F16-NEXT:    popq %rbx
-; F16-NEXT:    retq
+; AVX-LABEL: add:
+; AVX:       # %bb.0:
+; AVX-NEXT:    pushq %rbx
+; AVX-NEXT:    movq %rdx, %rbx
+; AVX-NEXT:    movzwl (%rsi), %eax
+; AVX-NEXT:    shll $16, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    movzwl (%rdi), %eax
+; AVX-NEXT:    shll $16, %eax
+; AVX-NEXT:    vmovd %eax, %xmm1
+; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    callq __truncsfbf2@PLT
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    movw %ax, (%rbx)
+; AVX-NEXT:    popq %rbx
+; AVX-NEXT:    retq
   %a = load bfloat, ptr %pa
   %b = load bfloat, ptr %pb
   %add = fadd bfloat %a, %b
@@ -59,19 +60,19 @@ define bfloat @add2(bfloat %a, bfloat %b) nounwind {
 ; SSE2-NEXT:    popq %rax
 ; SSE2-NEXT:    retq
 ;
-; F16-LABEL: add2:
-; F16:       # %bb.0:
-; F16-NEXT:    pushq %rax
-; F16-NEXT:    vmovd %xmm0, %eax
-; F16-NEXT:    vmovd %xmm1, %ecx
-; F16-NEXT:    shll $16, %ecx
-; F16-NEXT:    vmovd %ecx, %xmm0
-; F16-NEXT:    shll $16, %eax
-; F16-NEXT:    vmovd %eax, %xmm1
-; F16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; F16-NEXT:    callq __truncsfbf2@PLT
-; F16-NEXT:    popq %rax
-; F16-NEXT:    retq
+; AVX-LABEL: add2:
+; AVX:       # %bb.0:
+; AVX-NEXT:    pushq %rax
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    vmovd %xmm1, %ecx
+; AVX-NEXT:    shll $16, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm0
+; AVX-NEXT:    shll $16, %eax
+; AVX-NEXT:    vmovd %eax, %xmm1
+; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    callq __truncsfbf2@PLT
+; AVX-NEXT:    popq %rax
+; AVX-NEXT:    retq
   %add = fadd bfloat %a, %b
   ret bfloat %add
 }
@@ -106,34 +107,34 @@ define void @add_double(ptr %pa, ptr %pb, ptr %pc) nounwind {
 ; SSE2-NEXT:    popq %rbp
 ; SSE2-NEXT:    retq
 ;
-; F16-LABEL: add_double:
-; F16:       # %bb.0:
-; F16-NEXT:    pushq %rbp
-; F16-NEXT:    pushq %r14
-; F16-NEXT:    pushq %rbx
-; F16-NEXT:    movq %rdx, %rbx
-; F16-NEXT:    movq %rsi, %r14
-; F16-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; F16-NEXT:    callq __truncdfbf2@PLT
-; F16-NEXT:    vmovd %xmm0, %ebp
-; F16-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; F16-NEXT:    callq __truncdfbf2@PLT
-; F16-NEXT:    vmovd %xmm0, %eax
-; F16-NEXT:    shll $16, %eax
-; F16-NEXT:    vmovd %eax, %xmm0
-; F16-NEXT:    shll $16, %ebp
-; F16-NEXT:    vmovd %ebp, %xmm1
-; F16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; F16-NEXT:    callq __truncsfbf2@PLT
-; F16-NEXT:    vmovd %xmm0, %eax
-; F16-NEXT:    shll $16, %eax
-; F16-NEXT:    vmovd %eax, %xmm0
-; F16-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
-; F16-NEXT:    vmovsd %xmm0, (%rbx)
-; F16-NEXT:    popq %rbx
-; F16-NEXT:    popq %r14
-; F16-NEXT:    popq %rbp
-; F16-NEXT:    retq
+; AVX-LABEL: add_double:
+; AVX:       # %bb.0:
+; AVX-NEXT:    pushq %rbp
+; AVX-NEXT:    pushq %r14
+; AVX-NEXT:    pushq %rbx
+; AVX-NEXT:    movq %rdx, %rbx
+; AVX-NEXT:    movq %rsi, %r14
+; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    callq __truncdfbf2@PLT
+; AVX-NEXT:    vmovd %xmm0, %ebp
+; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    callq __truncdfbf2@PLT
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    shll $16, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    shll $16, %ebp
+; AVX-NEXT:    vmovd %ebp, %xmm1
+; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    callq __truncsfbf2@PLT
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    shll $16, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovsd %xmm0, (%rbx)
+; AVX-NEXT:    popq %rbx
+; AVX-NEXT:    popq %r14
+; AVX-NEXT:    popq %rbp
+; AVX-NEXT:    retq
   %la = load double, ptr %pa
   %a = fptrunc double %la to bfloat
   %lb = load double, ptr %pb
@@ -170,30 +171,30 @@ define double @add_double2(double %da, double %db) nounwind {
 ; SSE2-NEXT:    popq %rbx
 ; SSE2-NEXT:    retq
 ;
-; F16-LABEL: add_double2:
-; F16:       # %bb.0:
-; F16-NEXT:    pushq %rbx
-; F16-NEXT:    subq $16, %rsp
-; F16-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; F16-NEXT:    callq __truncdfbf2@PLT
-; F16-NEXT:    vmovd %xmm0, %ebx
-; F16-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
-; F16-NEXT:    # xmm0 = mem[0],zero
-; F16-NEXT:    callq __truncdfbf2@PLT
-; F16-NEXT:    vmovd %xmm0, %eax
-; F16-NEXT:    shll $16, %eax
-; F16-NEXT:    vmovd %eax, %xmm0
-; F16-NEXT:    shll $16, %ebx
-; F16-NEXT:    vmovd %ebx, %xmm1
-; F16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; F16-NEXT:    callq __truncsfbf2@PLT
-; F16-NEXT:    vmovd %xmm0, %eax
-; F16-NEXT:    shll $16, %eax
-; F16-NEXT:    vmovd %eax, %xmm0
-; F16-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
-; F16-NEXT:    addq $16, %rsp
-; F16-NEXT:    popq %rbx
-; F16-NEXT:    retq
+; AVX-LABEL: add_double2:
+; AVX:       # %bb.0:
+; AVX-NEXT:    pushq %rbx
+; AVX-NEXT:    subq $16, %rsp
+; AVX-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX-NEXT:    callq __truncdfbf2@PLT
+; AVX-NEXT:    vmovd %xmm0, %ebx
+; AVX-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
+; AVX-NEXT:    # xmm0 = mem[0],zero
+; AVX-NEXT:    callq __truncdfbf2@PLT
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    shll $16, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    shll $16, %ebx
+; AVX-NEXT:    vmovd %ebx, %xmm1
+; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    callq __truncsfbf2@PLT
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    shll $16, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    addq $16, %rsp
+; AVX-NEXT:    popq %rbx
+; AVX-NEXT:    retq
   %a = fptrunc double %da to bfloat
   %b = fptrunc double %db to bfloat
   %add = fadd bfloat %a, %b
@@ -216,19 +217,19 @@ define void @add_constant(ptr %pa, ptr %pc) nounwind {
 ; SSE2-NEXT:    popq %rbx
 ; SSE2-NEXT:    retq
 ;
-; F16-LABEL: add_constant:
-; F16:       # %bb.0:
-; F16-NEXT:    pushq %rbx
-; F16-NEXT:    movq %rsi, %rbx
-; F16-NEXT:    movzwl (%rdi), %eax
-; F16-NEXT:    shll $16, %eax
-; F16-NEXT:    vmovd %eax, %xmm0
-; F16-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; F16-NEXT:    callq __truncsfbf2@PLT
-; F16-NEXT:    vmovd %xmm0, %eax
-; F16-NEXT:    movw %ax, (%rbx)
-; F16-NEXT:    popq %rbx
-; F16-NEXT:    retq
+; AVX-LABEL: add_constant:
+; AVX:       # %bb.0:
+; AVX-NEXT:    pushq %rbx
+; AVX-NEXT:    movq %rsi, %rbx
+; AVX-NEXT:    movzwl (%rdi), %eax
+; AVX-NEXT:    shll $16, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    callq __truncsfbf2@PLT
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    movw %ax, (%rbx)
+; AVX-NEXT:    popq %rbx
+; AVX-NEXT:    retq
   %a = load bfloat, ptr %pa
   %add = fadd bfloat %a, 1.0
   store bfloat %add, ptr %pc
@@ -247,16 +248,16 @@ define bfloat @add_constant2(bfloat %a) nounwind {
 ; SSE2-NEXT:    popq %rax
 ; SSE2-NEXT:    retq
 ;
-; F16-LABEL: add_constant2:
-; F16:       # %bb.0:
-; F16-NEXT:    pushq %rax
-; F16-NEXT:    vmovd %xmm0, %eax
-; F16-NEXT:    shll $16, %eax
-; F16-NEXT:    vmovd %eax, %xmm0
-; F16-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; F16-NEXT:    callq __truncsfbf2@PLT
-; F16-NEXT:    popq %rax
-; F16-NEXT:    retq
+; AVX-LABEL: add_constant2:
+; AVX:       # %bb.0:
+; AVX-NEXT:    pushq %rax
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    shll $16, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    callq __truncsfbf2@PLT
+; AVX-NEXT:    popq %rax
+; AVX-NEXT:    retq
   %add = fadd bfloat %a, 1.0
   ret bfloat %add
 }
@@ -656,6 +657,120 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
 ; FP16-NEXT:    popq %r15
 ; FP16-NEXT:    popq %rbp
 ; FP16-NEXT:    retq
+;
+; AVXNC-LABEL: addv:
+; AVXNC:       # %bb.0:
+; AVXNC-NEXT:    pushq %rbp
+; AVXNC-NEXT:    pushq %r15
+; AVXNC-NEXT:    pushq %r14
+; AVXNC-NEXT:    pushq %r13
+; AVXNC-NEXT:    pushq %r12
+; AVXNC-NEXT:    pushq %rbx
+; AVXNC-NEXT:    subq $40, %rsp
+; AVXNC-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
+; AVXNC-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVXNC-NEXT:    vpextrw $7, %xmm1, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm2
+; AVXNC-NEXT:    vpextrw $7, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm1
+; AVXNC-NEXT:    vaddss %xmm2, %xmm1, %xmm0
+; AVXNC-NEXT:    callq __truncsfbf2@PLT
+; AVXNC-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVXNC-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVXNC-NEXT:    vpextrw $6, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm0
+; AVXNC-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVXNC-NEXT:    vpextrw $6, %xmm1, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm1
+; AVXNC-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; AVXNC-NEXT:    callq __truncsfbf2@PLT
+; AVXNC-NEXT:    vmovd %xmm0, %ebp
+; AVXNC-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVXNC-NEXT:    vpextrw $5, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm0
+; AVXNC-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVXNC-NEXT:    vpextrw $5, %xmm1, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm1
+; AVXNC-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; AVXNC-NEXT:    callq __truncsfbf2@PLT
+; AVXNC-NEXT:    vmovd %xmm0, %r14d
+; AVXNC-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVXNC-NEXT:    vpextrw $4, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm0
+; AVXNC-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVXNC-NEXT:    vpextrw $4, %xmm1, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm1
+; AVXNC-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; AVXNC-NEXT:    callq __truncsfbf2@PLT
+; AVXNC-NEXT:    vmovd %xmm0, %r15d
+; AVXNC-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVXNC-NEXT:    vpextrw $3, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm0
+; AVXNC-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVXNC-NEXT:    vpextrw $3, %xmm1, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm1
+; AVXNC-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; AVXNC-NEXT:    callq __truncsfbf2@PLT
+; AVXNC-NEXT:    vmovd %xmm0, %r12d
+; AVXNC-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVXNC-NEXT:    vpextrw $2, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm0
+; AVXNC-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVXNC-NEXT:    vpextrw $2, %xmm1, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm1
+; AVXNC-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; AVXNC-NEXT:    callq __truncsfbf2@PLT
+; AVXNC-NEXT:    vmovd %xmm0, %r13d
+; AVXNC-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVXNC-NEXT:    vpextrw $1, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm0
+; AVXNC-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVXNC-NEXT:    vpextrw $1, %xmm1, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm1
+; AVXNC-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; AVXNC-NEXT:    callq __truncsfbf2@PLT
+; AVXNC-NEXT:    vmovd %xmm0, %ebx
+; AVXNC-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVXNC-NEXT:    vmovd %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm0
+; AVXNC-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVXNC-NEXT:    vmovd %xmm1, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm1
+; AVXNC-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; AVXNC-NEXT:    callq __truncsfbf2@PLT
+; AVXNC-NEXT:    vmovd %xmm0, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm0
+; AVXNC-NEXT:    vpinsrw $1, %ebx, %xmm0, %xmm0
+; AVXNC-NEXT:    vpinsrw $2, %r13d, %xmm0, %xmm0
+; AVXNC-NEXT:    vpinsrw $3, %r12d, %xmm0, %xmm0
+; AVXNC-NEXT:    vpinsrw $4, %r15d, %xmm0, %xmm0
+; AVXNC-NEXT:    vpinsrw $5, %r14d, %xmm0, %xmm0
+; AVXNC-NEXT:    vpinsrw $6, %ebp, %xmm0, %xmm0
+; AVXNC-NEXT:    vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVXNC-NEXT:    addq $40, %rsp
+; AVXNC-NEXT:    popq %rbx
+; AVXNC-NEXT:    popq %r12
+; AVXNC-NEXT:    popq %r13
+; AVXNC-NEXT:    popq %r14
+; AVXNC-NEXT:    popq %r15
+; AVXNC-NEXT:    popq %rbp
+; AVXNC-NEXT:    retq
   %add = fadd <8 x bfloat> %a, %b
   ret <8 x bfloat> %add
 }
@@ -677,6 +792,19 @@ define <2 x bfloat> @pr62997(bfloat %a, bfloat %b) {
 ; F16-NEXT:    vmovd %ecx, %xmm0
 ; F16-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
 ; F16-NEXT:    retq
+;
+; AVXNC-LABEL: pr62997:
+; AVXNC:       # %bb.0:
+; AVXNC-NEXT:    vmovd %xmm1, %eax
+; AVXNC-NEXT:    vmovd %xmm0, %ecx
+; AVXNC-NEXT:    vmovd %ecx, %xmm0
+; AVXNC-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
+; AVXNC-NEXT:    vmovd %xmm0, %eax
+; AVXNC-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVXNC-NEXT:    shrl $16, %eax
+; AVXNC-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm1
+; AVXNC-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVXNC-NEXT:    retq
   %1 = insertelement <2 x bfloat> undef, bfloat %a, i64 0
   %2 = insertelement <2 x bfloat> %1, bfloat %b, i64 1
   ret <2 x bfloat> %2
@@ -695,6 +823,12 @@ define <32 x bfloat> @pr63017() {
 ; F16:       # %bb.0:
 ; F16-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; F16-NEXT:    retq
+;
+; AVXNC-LABEL: pr63017:
+; AVXNC:       # %bb.0:
+; AVXNC-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVXNC-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVXNC-NEXT:    retq
   ret <32 x bfloat> zeroinitializer
 }
 
@@ -1270,6 +1404,256 @@ define <32 x bfloat> @pr63017_2() nounwind {
 ; F16-NEXT:    vpbroadcastw {{.*#+}} zmm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
 ; F16-NEXT:    vmovdqu16 (%rax), %zmm0 {%k1}
 ; F16-NEXT:    retq
+;
+; AVXNC-LABEL: pr63017_2:
+; AVXNC:       # %bb.0:
+; AVXNC-NEXT:    xorl %eax, %eax
+; AVXNC-NEXT:    testb %al, %al
+; AVXNC-NEXT:    jne .LBB12_1
+; AVXNC-NEXT:  # %bb.2: # %cond.load
+; AVXNC-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
+; AVXNC-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [49024,49024,49024,49024,49024,49024,49024,49024]
+; AVXNC-NEXT:    vpinsrw $0, (%rax), %xmm0, %xmm0
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVXNC-NEXT:    jmp .LBB12_3
+; AVXNC-NEXT:  .LBB12_1:
+; AVXNC-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
+; AVXNC-NEXT:    vmovdqa %ymm0, %ymm1
+; AVXNC-NEXT:  .LBB12_3: # %else
+; AVXNC-NEXT:    xorl %eax, %eax
+; AVXNC-NEXT:    testb %al, %al
+; AVXNC-NEXT:    jne .LBB12_5
+; AVXNC-NEXT:  # %bb.4: # %cond.load1
+; AVXNC-NEXT:    vpinsrw $1, (%rax), %xmm0, %xmm2
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVXNC-NEXT:  .LBB12_5: # %else2
+; AVXNC-NEXT:    xorl %eax, %eax
+; AVXNC-NEXT:    testb %al, %al
+; AVXNC-NEXT:    jne .LBB12_7
+; AVXNC-NEXT:  # %bb.6: # %cond.load4
+; AVXNC-NEXT:    vpinsrw $2, (%rax), %xmm0, %xmm2
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVXNC-NEXT:  .LBB12_7: # %else5
+; AVXNC-NEXT:    xorl %eax, %eax
+; AVXNC-NEXT:    testb %al, %al
+; AVXNC-NEXT:    jne .LBB12_9
+; AVXNC-NEXT:  # %bb.8: # %cond.load7
+; AVXNC-NEXT:    vpinsrw $3, (%rax), %xmm0, %xmm2
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVXNC-NEXT:  .LBB12_9: # %else8
+; AVXNC-NEXT:    xorl %eax, %eax
+; AVXNC-NEXT:    testb %al, %al
+; AVXNC-NEXT:    jne .LBB12_11
+; AVXNC-NEXT:  # %bb.10: # %cond.load10
+; AVXNC-NEXT:    vpinsrw $4, (%rax), %xmm0, %xmm2
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVXNC-NEXT:  .LBB12_11: # %else11
+; AVXNC-NEXT:    xorl %eax, %eax
+; AVXNC-NEXT:    testb %al, %al
+; AVXNC-NEXT:    jne .LBB12_13
+; AVXNC-NEXT:  # %bb.12: # %cond.load13
+; AVXNC-NEXT:    vpinsrw $5, (%rax), %xmm0, %xmm2
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVXNC-NEXT:  .LBB12_13: # %else14
+; AVXNC-NEXT:    xorl %eax, %eax
+; AVXNC-NEXT:    testb %al, %al
+; AVXNC-NEXT:    jne .LBB12_15
+; AVXNC-NEXT:  # %bb.14: # %cond.load16
+; AVXNC-NEXT:    vpinsrw $6, (%rax), %xmm0, %xmm2
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVXNC-NEXT:  .LBB12_15: # %else17
+; AVXNC-NEXT:    xorl %eax, %eax
+; AVXNC-NEXT:    testb %al, %al
+; AVXNC-NEXT:    jne .LBB12_17
+; AVXNC-NEXT:  # %bb.16: # %cond.load19
+; AVXNC-NEXT:    vpinsrw $7, (%rax), %xmm0, %xmm2
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVXNC-NEXT:  .LBB12_17: # %else20
+; AVXNC-NEXT:    xorl %eax, %eax
+; AVXNC-NEXT:    testb %al, %al
+; AVXNC-NEXT:    jne .LBB12_19
+; AVXNC-NEXT:  # %bb.18: # %cond.load22
+; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
+; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVXNC-NEXT:  .LBB12_19: # %else23
+; AVXNC-NEXT:    xorl %eax, %eax
+; AVXNC-NEXT:    testb %al, %al
+; AVXNC-NEXT:    jne .LBB12_21
+; AVXNC-NEXT:  # %bb.20: # %cond.load25
+; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
+; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7,8],ymm2[9],ymm0[10,11,12,13,14,15]
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVXNC-NEXT:  .LBB12_21: # %else26
+; AVXNC-NEXT:    xorl %eax, %eax
+; AVXNC-NEXT:    testb %al, %al
+; AVXNC-NEXT:    jne .LBB12_23
+; AVXNC-NEXT:  # %bb.22: # %cond.load28
+; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
+; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4,5,6,7,8,9],ymm2[10],ymm0[11,12,13,14,15]
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVXNC-NEXT:  .LBB12_23: # %else29
+; AVXNC-NEXT:    xorl %eax, %eax
+; AVXNC-NEXT:    testb %al, %al
+; AVXNC-NEXT:    jne .LBB12_25
+; AVXNC-NEXT:  # %bb.24: # %cond.load31
+; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
+; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15]
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVXNC-NEXT:  .LBB12_25: # %else32
+; AVXNC-NEXT:    xorl %eax, %eax
+; AVXNC-NEXT:    testb %al, %al
+; AVXNC-NEXT:    jne .LBB12_27
+; AVXNC-NEXT:  # %bb.26: # %cond.load34
+; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
+; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7,8,9,10,11],ymm2[12],ymm0[13,14,15]
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVXNC-NEXT:  .LBB12_27: # %else35
+; AVXNC-NEXT:    xorl %eax, %eax
+; AVXNC-NEXT:    testb %al, %al
+; AVXNC-NEXT:    jne .LBB12_29
+; AVXNC-NEXT:  # %bb.28: # %cond.load37
+; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
+; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7,8,9,10,11,12],ymm2[13],ymm0[14,15]
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVXNC-NEXT:  .LBB12_29: # %else38
+; AVXNC-NEXT:    xorl %eax, %eax
+; AVXNC-NEXT:    testb %al, %al
+; AVXNC-NEXT:    jne .LBB12_31
+; AVXNC-NEXT:  # %bb.30: # %cond.load40
+; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
+; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7,8,9,10,11,12,13],ymm2[14],ymm0[15]
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVXNC-NEXT:  .LBB12_31: # %else41
+; AVXNC-NEXT:    xorl %eax, %eax
+; AVXNC-NEXT:    testb %al, %al
+; AVXNC-NEXT:    jne .LBB12_33
+; AVXNC-NEXT:  # %bb.32: # %cond.load43
+; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
+; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5,6],ymm2[7],ymm0[8,9,10,11,12,13,14],ymm2[15]
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVXNC-NEXT:  .LBB12_33: # %else44
+; AVXNC-NEXT:    xorl %eax, %eax
+; AVXNC-NEXT:    testb %al, %al
+; AVXNC-NEXT:    jne .LBB12_35
+; AVXNC-NEXT:  # %bb.34: # %cond.load46
+; AVXNC-NEXT:    vpinsrw $0, (%rax), %xmm1, %xmm2
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVXNC-NEXT:  .LBB12_35: # %else47
+; AVXNC-NEXT:    xorl %eax, %eax
+; AVXNC-NEXT:    testb %al, %al
+; AVXNC-NEXT:    jne .LBB12_37
+; AVXNC-NEXT:  # %bb.36: # %cond.load49
+; AVXNC-NEXT:    vpinsrw $1, (%rax), %xmm1, %xmm2
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVXNC-NEXT:  .LBB12_37: # %else50
+; AVXNC-NEXT:    xorl %eax, %eax
+; AVXNC-NEXT:    testb %al, %al
+; AVXNC-NEXT:    jne .LBB12_39
+; AVXNC-NEXT:  # %bb.38: # %cond.load52
+; AVXNC-NEXT:    vpinsrw $2, (%rax), %xmm1, %xmm2
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVXNC-NEXT:  .LBB12_39: # %else53
+; AVXNC-NEXT:    xorl %eax, %eax
+; AVXNC-NEXT:    testb %al, %al
+; AVXNC-NEXT:    jne .LBB12_41
+; AVXNC-NEXT:  # %bb.40: # %cond.load55
+; AVXNC-NEXT:    vpinsrw $3, (%rax), %xmm1, %xmm2
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVXNC-NEXT:  .LBB12_41: # %else56
+; AVXNC-NEXT:    xorl %eax, %eax
+; AVXNC-NEXT:    testb %al, %al
+; AVXNC-NEXT:    jne .LBB12_43
+; AVXNC-NEXT:  # %bb.42: # %cond.load58
+; AVXNC-NEXT:    vpinsrw $4, (%rax), %xmm1, %xmm2
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVXNC-NEXT:  .LBB12_43: # %else59
+; AVXNC-NEXT:    xorl %eax, %eax
+; AVXNC-NEXT:    testb %al, %al
+; AVXNC-NEXT:    jne .LBB12_45
+; AVXNC-NEXT:  # %bb.44: # %cond.load61
+; AVXNC-NEXT:    vpinsrw $5, (%rax), %xmm1, %xmm2
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVXNC-NEXT:  .LBB12_45: # %else62
+; AVXNC-NEXT:    xorl %eax, %eax
+; AVXNC-NEXT:    testb %al, %al
+; AVXNC-NEXT:    jne .LBB12_47
+; AVXNC-NEXT:  # %bb.46: # %cond.load64
+; AVXNC-NEXT:    vpinsrw $6, (%rax), %xmm1, %xmm2
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVXNC-NEXT:  .LBB12_47: # %else65
+; AVXNC-NEXT:    xorl %eax, %eax
+; AVXNC-NEXT:    testb %al, %al
+; AVXNC-NEXT:    jne .LBB12_49
+; AVXNC-NEXT:  # %bb.48: # %cond.load67
+; AVXNC-NEXT:    vpinsrw $7, (%rax), %xmm1, %xmm2
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVXNC-NEXT:  .LBB12_49: # %else68
+; AVXNC-NEXT:    xorl %eax, %eax
+; AVXNC-NEXT:    testb %al, %al
+; AVXNC-NEXT:    jne .LBB12_51
+; AVXNC-NEXT:  # %bb.50: # %cond.load70
+; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
+; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVXNC-NEXT:  .LBB12_51: # %else71
+; AVXNC-NEXT:    xorl %eax, %eax
+; AVXNC-NEXT:    testb %al, %al
+; AVXNC-NEXT:    jne .LBB12_53
+; AVXNC-NEXT:  # %bb.52: # %cond.load73
+; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
+; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7,8],ymm2[9],ymm1[10,11,12,13,14,15]
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVXNC-NEXT:  .LBB12_53: # %else74
+; AVXNC-NEXT:    xorl %eax, %eax
+; AVXNC-NEXT:    testb %al, %al
+; AVXNC-NEXT:    jne .LBB12_55
+; AVXNC-NEXT:  # %bb.54: # %cond.load76
+; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
+; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6,7,8,9],ymm2[10],ymm1[11,12,13,14,15]
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVXNC-NEXT:  .LBB12_55: # %else77
+; AVXNC-NEXT:    xorl %eax, %eax
+; AVXNC-NEXT:    testb %al, %al
+; AVXNC-NEXT:    jne .LBB12_57
+; AVXNC-NEXT:  # %bb.56: # %cond.load79
+; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
+; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7,8,9,10],ymm2[11],ymm1[12,13,14,15]
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVXNC-NEXT:  .LBB12_57: # %else80
+; AVXNC-NEXT:    xorl %eax, %eax
+; AVXNC-NEXT:    testb %al, %al
+; AVXNC-NEXT:    jne .LBB12_59
+; AVXNC-NEXT:  # %bb.58: # %cond.load82
+; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
+; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7,8,9,10,11],ymm2[12],ymm1[13,14,15]
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVXNC-NEXT:  .LBB12_59: # %else83
+; AVXNC-NEXT:    xorl %eax, %eax
+; AVXNC-NEXT:    testb %al, %al
+; AVXNC-NEXT:    jne .LBB12_61
+; AVXNC-NEXT:  # %bb.60: # %cond.load85
+; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
+; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7,8,9,10,11,12],ymm2[13],ymm1[14,15]
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVXNC-NEXT:  .LBB12_61: # %else86
+; AVXNC-NEXT:    xorl %eax, %eax
+; AVXNC-NEXT:    testb %al, %al
+; AVXNC-NEXT:    jne .LBB12_63
+; AVXNC-NEXT:  # %bb.62: # %cond.load88
+; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
+; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15]
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVXNC-NEXT:  .LBB12_63: # %else89
+; AVXNC-NEXT:    xorl %eax, %eax
+; AVXNC-NEXT:    testb %al, %al
+; AVXNC-NEXT:    jne .LBB12_65
+; AVXNC-NEXT:  # %bb.64: # %cond.load91
+; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
+; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,6],ymm2[7],ymm1[8,9,10,11,12,13,14],ymm2[15]
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVXNC-NEXT:  .LBB12_65: # %else92
+; AVXNC-NEXT:    retq
   %1 = call <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr poison, i32 2, <32 x i1> poison, <32 x bfloat> <bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80>)
   ret <32 x bfloat> %1
 }
@@ -1295,6 +1679,13 @@ define <32 x bfloat> @pr62997_3(<32 x bfloat> %0, bfloat %1) {
 ; F16-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm1
 ; F16-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
 ; F16-NEXT:    retq
+;
+; AVXNC-LABEL: pr62997_3:
+; AVXNC:       # %bb.0:
+; AVXNC-NEXT:    vmovd %xmm2, %eax
+; AVXNC-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm2
+; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVXNC-NEXT:    retq
   %3 = insertelement <32 x bfloat> %0, bfloat %1, i64 1
   ret <32 x bfloat> %3
 }
@@ -1328,6 +1719,25 @@ define <4 x float> @pr64460_1(<4 x bfloat> %a) {
 ; F16-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; F16-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; F16-NEXT:    retq
+;
+; AVXNC-LABEL: pr64460_1:
+; AVXNC:       # %bb.0:
+; AVXNC-NEXT:    vpextrw $1, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm1
+; AVXNC-NEXT:    vmovd %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm2
+; AVXNC-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; AVXNC-NEXT:    vpextrw $2, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm2
+; AVXNC-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVXNC-NEXT:    vpextrw $3, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm0
+; AVXNC-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVXNC-NEXT:    retq
   %b = fpext <4 x bfloat> %a to <4 x float>
   ret <4 x float> %b
 }
@@ -1377,6 +1787,41 @@ define <8 x float> @pr64460_2(<8 x bfloat> %a) {
 ; F16-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; F16-NEXT:    vpslld $16, %ymm0, %ymm0
 ; F16-NEXT:    retq
+;
+; AVXNC-LABEL: pr64460_2:
+; AVXNC:       # %bb.0:
+; AVXNC-NEXT:    vpextrw $5, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm1
+; AVXNC-NEXT:    vpextrw $4, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm2
+; AVXNC-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; AVXNC-NEXT:    vpextrw $6, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm2
+; AVXNC-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVXNC-NEXT:    vpextrw $7, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm2
+; AVXNC-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; AVXNC-NEXT:    vpextrw $1, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm2
+; AVXNC-NEXT:    vmovd %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm3
+; AVXNC-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; AVXNC-NEXT:    vpextrw $2, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm3
+; AVXNC-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
+; AVXNC-NEXT:    vpextrw $3, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm0
+; AVXNC-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
+; AVXNC-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVXNC-NEXT:    retq
   %b = fpext <8 x bfloat> %a to <8 x float>
   ret <8 x float> %b
 }
@@ -1461,6 +1906,74 @@ define <16 x float> @pr64460_3(<16 x bfloat> %a) {
 ; F16-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; F16-NEXT:    vpslld $16, %zmm0, %zmm0
 ; F16-NEXT:    retq
+;
+; AVXNC-LABEL: pr64460_3:
+; AVXNC:       # %bb.0:
+; AVXNC-NEXT:    vpextrw $5, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm1
+; AVXNC-NEXT:    vpextrw $4, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm2
+; AVXNC-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; AVXNC-NEXT:    vpextrw $6, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm2
+; AVXNC-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVXNC-NEXT:    vpextrw $7, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm2
+; AVXNC-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; AVXNC-NEXT:    vpextrw $1, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm2
+; AVXNC-NEXT:    vmovd %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm3
+; AVXNC-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; AVXNC-NEXT:    vpextrw $2, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm3
+; AVXNC-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
+; AVXNC-NEXT:    vpextrw $3, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm3
+; AVXNC-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
+; AVXNC-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm2
+; AVXNC-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVXNC-NEXT:    vpextrw $5, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm1
+; AVXNC-NEXT:    vpextrw $4, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm3
+; AVXNC-NEXT:    vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
+; AVXNC-NEXT:    vpextrw $6, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm3
+; AVXNC-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
+; AVXNC-NEXT:    vpextrw $7, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm3
+; AVXNC-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
+; AVXNC-NEXT:    vpextrw $1, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm3
+; AVXNC-NEXT:    vmovd %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm4
+; AVXNC-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
+; AVXNC-NEXT:    vpextrw $2, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm4
+; AVXNC-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
+; AVXNC-NEXT:    vpextrw $3, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm0
+; AVXNC-NEXT:    vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0]
+; AVXNC-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVXNC-NEXT:    vmovaps %ymm2, %ymm0
+; AVXNC-NEXT:    retq
   %b = fpext <16 x bfloat> %a to <16 x float>
   ret <16 x float> %b
 }
@@ -1517,6 +2030,49 @@ define <8 x double> @pr64460_4(<8 x bfloat> %a) {
 ; F16-NEXT:    vpslld $16, %ymm0, %ymm0
 ; F16-NEXT:    vcvtps2pd %ymm0, %zmm0
 ; F16-NEXT:    retq
+;
+; AVXNC-LABEL: pr64460_4:
+; AVXNC:       # %bb.0:
+; AVXNC-NEXT:    vpextrw $3, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm1
+; AVXNC-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVXNC-NEXT:    vpextrw $2, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm2
+; AVXNC-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVXNC-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVXNC-NEXT:    vpextrw $1, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm2
+; AVXNC-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVXNC-NEXT:    vmovd %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm3
+; AVXNC-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
+; AVXNC-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVXNC-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm2
+; AVXNC-NEXT:    vpextrw $7, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm1
+; AVXNC-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVXNC-NEXT:    vpextrw $6, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm3
+; AVXNC-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
+; AVXNC-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVXNC-NEXT:    vpextrw $5, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm3
+; AVXNC-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
+; AVXNC-NEXT:    vpextrw $4, %xmm0, %eax
+; AVXNC-NEXT:    shll $16, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm0
+; AVXNC-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVXNC-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; AVXNC-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVXNC-NEXT:    vmovaps %ymm2, %ymm0
+; AVXNC-NEXT:    retq
   %b = fpext <8 x bfloat> %a to <8 x double>
   ret <8 x double> %b
 }

From e9eaf3dc64a62f3991c03f4176eb7f92034469a3 Mon Sep 17 00:00:00 2001
From: Phoebe Wang <phoebe.wang@intel.com>
Date: Tue, 29 Aug 2023 09:31:00 +0800
Subject: [PATCH 76/92] [X86][BF16] Lower FP_ROUND for vector types under
 AVX512BF16

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D158952

(cherry picked from commit b667e9c23d6d2f1c7371eb4a03b30de4a6f8b7b6)
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  14 +-
 llvm/lib/Target/X86/X86InstrAVX512.td         |  10 +
 llvm/lib/Target/X86/X86InstrSSE.td            |   5 +
 .../CodeGen/X86/avxneconvert-intrinsics.ll    |   2 -
 llvm/test/CodeGen/X86/bfloat.ll               | 881 +++++++++++++-----
 5 files changed, 660 insertions(+), 252 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7bcc181e2ca91..d9750ea22e2ba 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2272,8 +2272,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
   if (!Subtarget.useSoftFloat() &&
       (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
-    addRegisterClass(MVT::v8bf16, &X86::VR128XRegClass);
-    addRegisterClass(MVT::v16bf16, &X86::VR256XRegClass);
+    addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
+                                                        : &X86::VR128RegClass);
+    addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
+                                                         : &X86::VR256RegClass);
     // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
     // provide the method to promote BUILD_VECTOR and INSERT_VECTOR_ELT.
     // Set the operation action Custom to do the customization later.
@@ -2288,6 +2290,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
       setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
     }
+    setOperationAction(ISD::FP_ROUND, MVT::v8bf16, Custom);
     addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
   }
 
@@ -2299,6 +2302,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FMUL, MVT::v32bf16, Expand);
     setOperationAction(ISD::FDIV, MVT::v32bf16, Expand);
     setOperationAction(ISD::BUILD_VECTOR, MVT::v32bf16, Custom);
+    setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32bf16, Custom);
   }
 
@@ -24049,6 +24053,12 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
     return Res;
   }
 
+  if (VT.getScalarType() == MVT::bf16) {
+    if (SVT.getScalarType() == MVT::f32 && isTypeLegal(VT))
+      return Op;
+    return SDValue();
+  }
+
   if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
     if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
       return SDValue();
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index ecb5c3e912401..b5dac7a0c65af 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -12976,6 +12976,11 @@ let Predicates = [HasBF16, HasVLX] in {
   def : Pat<(v16bf16 (X86VBroadcast (v8bf16 VR128X:$src))),
             (VPBROADCASTWZ256rr VR128X:$src)>;
 
+  def : Pat<(v8bf16 (X86vfpround (v8f32 VR256X:$src))),
+            (VCVTNEPS2BF16Z256rr VR256X:$src)>;
+  def : Pat<(v8bf16 (X86vfpround (loadv8f32 addr:$src))),
+            (VCVTNEPS2BF16Z256rm addr:$src)>;
+
   // TODO: No scalar broadcast due to we don't support legal scalar bf16 so far.
 }
 
@@ -12985,6 +12990,11 @@ let Predicates = [HasBF16] in {
 
   def : Pat<(v32bf16 (X86VBroadcast (v8bf16 VR128X:$src))),
             (VPBROADCASTWZrr VR128X:$src)>;
+
+  def : Pat<(v16bf16 (X86vfpround (v16f32 VR512:$src))),
+            (VCVTNEPS2BF16Zrr VR512:$src)>;
+  def : Pat<(v16bf16 (X86vfpround (loadv16f32 addr:$src))),
+            (VCVTNEPS2BF16Zrm addr:$src)>;
   // TODO: No scalar broadcast due to we don't support legal scalar bf16 so far.
 }
 
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 6c57eceab3769..a6fcc804e1d06 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -8289,6 +8289,11 @@ let Predicates = [HasAVXNECONVERT] in {
        f256mem>, T8PS;
   let checkVEXPredicate = 1 in
   defm VCVTNEPS2BF16 : VCVTNEPS2BF16_BASE, VEX, T8XS, ExplicitVEXPrefix;
+
+  def : Pat<(v8bf16 (X86vfpround (v8f32 VR256:$src))),
+            (VCVTNEPS2BF16Yrr VR256:$src)>;
+  def : Pat<(v8bf16 (X86vfpround (loadv8f32 addr:$src))),
+            (VCVTNEPS2BF16Yrm addr:$src)>;
 }
 
 def : InstAlias<"vcvtneps2bf16x\t{$src, $dst|$dst, $src}",
diff --git a/llvm/test/CodeGen/X86/avxneconvert-intrinsics.ll b/llvm/test/CodeGen/X86/avxneconvert-intrinsics.ll
index 293a67e59e0c9..b311c8831457b 100644
--- a/llvm/test/CodeGen/X86/avxneconvert-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avxneconvert-intrinsics.ll
@@ -198,7 +198,6 @@ define <8 x bfloat> @test_int_x86_vcvtneps2bf16128(<4 x float> %A) {
 ; CHECK-LABEL: test_int_x86_vcvtneps2bf16128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    {vex} vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x7a,0x72,0xc0]
-; CHECK-NEXT:    # kill: def $xmm1 killed $xmm0
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %ret = call <8 x bfloat> @llvm.x86.vcvtneps2bf16128(<4 x float> %A)
   ret <8 x bfloat> %ret
@@ -209,7 +208,6 @@ define <8 x bfloat> @test_int_x86_vcvtneps2bf16256(<8 x float> %A) {
 ; CHECK-LABEL: test_int_x86_vcvtneps2bf16256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    {vex} vcvtneps2bf16 %ymm0, %xmm0 # encoding: [0xc4,0xe2,0x7e,0x72,0xc0]
-; CHECK-NEXT:    # kill: def $xmm1 killed $xmm0
 ; CHECK-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %ret = call <8 x bfloat> @llvm.x86.vcvtneps2bf16256(<8 x float> %A)
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
index 6798adaf1e5f2..7a82515ad24b7 100644
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -2,7 +2,7 @@
 ; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=CHECK,SSE2
 ; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,F16,BF16
 ; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,F16,FP16
-; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avxneconvert | FileCheck %s --check-prefixes=CHECK,AVX,AVXNC
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avxneconvert,f16c | FileCheck %s --check-prefixes=CHECK,AVX,AVXNC
 
 define void @add(ptr %pa, ptr %pb, ptr %pc) nounwind {
 ; SSE2-LABEL: add:
@@ -785,26 +785,13 @@ define <2 x bfloat> @pr62997(bfloat %a, bfloat %b) {
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; SSE2-NEXT:    retq
 ;
-; F16-LABEL: pr62997:
-; F16:       # %bb.0:
-; F16-NEXT:    vmovd %xmm1, %eax
-; F16-NEXT:    vmovd %xmm0, %ecx
-; F16-NEXT:    vmovd %ecx, %xmm0
-; F16-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
-; F16-NEXT:    retq
-;
-; AVXNC-LABEL: pr62997:
-; AVXNC:       # %bb.0:
-; AVXNC-NEXT:    vmovd %xmm1, %eax
-; AVXNC-NEXT:    vmovd %xmm0, %ecx
-; AVXNC-NEXT:    vmovd %ecx, %xmm0
-; AVXNC-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
-; AVXNC-NEXT:    vmovd %xmm0, %eax
-; AVXNC-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
-; AVXNC-NEXT:    shrl $16, %eax
-; AVXNC-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm1
-; AVXNC-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVXNC-NEXT:    retq
+; AVX-LABEL: pr62997:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovd %xmm1, %eax
+; AVX-NEXT:    vmovd %xmm0, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm0
+; AVX-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = insertelement <2 x bfloat> undef, bfloat %a, i64 0
   %2 = insertelement <2 x bfloat> %1, bfloat %b, i64 1
   ret <2 x bfloat> %2
@@ -1407,252 +1394,250 @@ define <32 x bfloat> @pr63017_2() nounwind {
 ;
 ; AVXNC-LABEL: pr63017_2:
 ; AVXNC:       # %bb.0:
+; AVXNC-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
 ; AVXNC-NEXT:    xorl %eax, %eax
 ; AVXNC-NEXT:    testb %al, %al
-; AVXNC-NEXT:    jne .LBB12_1
-; AVXNC-NEXT:  # %bb.2: # %cond.load
+; AVXNC-NEXT:    vmovdqa %ymm0, %ymm1
+; AVXNC-NEXT:    jne .LBB12_2
+; AVXNC-NEXT:  # %bb.1: # %cond.load
 ; AVXNC-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
 ; AVXNC-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [49024,49024,49024,49024,49024,49024,49024,49024]
 ; AVXNC-NEXT:    vpinsrw $0, (%rax), %xmm0, %xmm0
 ; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVXNC-NEXT:    jmp .LBB12_3
-; AVXNC-NEXT:  .LBB12_1:
-; AVXNC-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
-; AVXNC-NEXT:    vmovdqa %ymm0, %ymm1
-; AVXNC-NEXT:  .LBB12_3: # %else
+; AVXNC-NEXT:  .LBB12_2: # %else
 ; AVXNC-NEXT:    xorl %eax, %eax
 ; AVXNC-NEXT:    testb %al, %al
-; AVXNC-NEXT:    jne .LBB12_5
-; AVXNC-NEXT:  # %bb.4: # %cond.load1
+; AVXNC-NEXT:    jne .LBB12_4
+; AVXNC-NEXT:  # %bb.3: # %cond.load1
 ; AVXNC-NEXT:    vpinsrw $1, (%rax), %xmm0, %xmm2
 ; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVXNC-NEXT:  .LBB12_5: # %else2
+; AVXNC-NEXT:  .LBB12_4: # %else2
 ; AVXNC-NEXT:    xorl %eax, %eax
 ; AVXNC-NEXT:    testb %al, %al
-; AVXNC-NEXT:    jne .LBB12_7
-; AVXNC-NEXT:  # %bb.6: # %cond.load4
+; AVXNC-NEXT:    jne .LBB12_6
+; AVXNC-NEXT:  # %bb.5: # %cond.load4
 ; AVXNC-NEXT:    vpinsrw $2, (%rax), %xmm0, %xmm2
 ; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVXNC-NEXT:  .LBB12_7: # %else5
+; AVXNC-NEXT:  .LBB12_6: # %else5
 ; AVXNC-NEXT:    xorl %eax, %eax
 ; AVXNC-NEXT:    testb %al, %al
-; AVXNC-NEXT:    jne .LBB12_9
-; AVXNC-NEXT:  # %bb.8: # %cond.load7
+; AVXNC-NEXT:    jne .LBB12_8
+; AVXNC-NEXT:  # %bb.7: # %cond.load7
 ; AVXNC-NEXT:    vpinsrw $3, (%rax), %xmm0, %xmm2
 ; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVXNC-NEXT:  .LBB12_9: # %else8
+; AVXNC-NEXT:  .LBB12_8: # %else8
 ; AVXNC-NEXT:    xorl %eax, %eax
 ; AVXNC-NEXT:    testb %al, %al
-; AVXNC-NEXT:    jne .LBB12_11
-; AVXNC-NEXT:  # %bb.10: # %cond.load10
+; AVXNC-NEXT:    jne .LBB12_10
+; AVXNC-NEXT:  # %bb.9: # %cond.load10
 ; AVXNC-NEXT:    vpinsrw $4, (%rax), %xmm0, %xmm2
 ; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVXNC-NEXT:  .LBB12_11: # %else11
+; AVXNC-NEXT:  .LBB12_10: # %else11
 ; AVXNC-NEXT:    xorl %eax, %eax
 ; AVXNC-NEXT:    testb %al, %al
-; AVXNC-NEXT:    jne .LBB12_13
-; AVXNC-NEXT:  # %bb.12: # %cond.load13
+; AVXNC-NEXT:    jne .LBB12_12
+; AVXNC-NEXT:  # %bb.11: # %cond.load13
 ; AVXNC-NEXT:    vpinsrw $5, (%rax), %xmm0, %xmm2
 ; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVXNC-NEXT:  .LBB12_13: # %else14
+; AVXNC-NEXT:  .LBB12_12: # %else14
 ; AVXNC-NEXT:    xorl %eax, %eax
 ; AVXNC-NEXT:    testb %al, %al
-; AVXNC-NEXT:    jne .LBB12_15
-; AVXNC-NEXT:  # %bb.14: # %cond.load16
+; AVXNC-NEXT:    jne .LBB12_14
+; AVXNC-NEXT:  # %bb.13: # %cond.load16
 ; AVXNC-NEXT:    vpinsrw $6, (%rax), %xmm0, %xmm2
 ; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVXNC-NEXT:  .LBB12_15: # %else17
+; AVXNC-NEXT:  .LBB12_14: # %else17
 ; AVXNC-NEXT:    xorl %eax, %eax
 ; AVXNC-NEXT:    testb %al, %al
-; AVXNC-NEXT:    jne .LBB12_17
-; AVXNC-NEXT:  # %bb.16: # %cond.load19
+; AVXNC-NEXT:    jne .LBB12_16
+; AVXNC-NEXT:  # %bb.15: # %cond.load19
 ; AVXNC-NEXT:    vpinsrw $7, (%rax), %xmm0, %xmm2
 ; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVXNC-NEXT:  .LBB12_17: # %else20
+; AVXNC-NEXT:  .LBB12_16: # %else20
 ; AVXNC-NEXT:    xorl %eax, %eax
 ; AVXNC-NEXT:    testb %al, %al
-; AVXNC-NEXT:    jne .LBB12_19
-; AVXNC-NEXT:  # %bb.18: # %cond.load22
+; AVXNC-NEXT:    jne .LBB12_18
+; AVXNC-NEXT:  # %bb.17: # %cond.load22
 ; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
 ; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
 ; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVXNC-NEXT:  .LBB12_19: # %else23
+; AVXNC-NEXT:  .LBB12_18: # %else23
 ; AVXNC-NEXT:    xorl %eax, %eax
 ; AVXNC-NEXT:    testb %al, %al
-; AVXNC-NEXT:    jne .LBB12_21
-; AVXNC-NEXT:  # %bb.20: # %cond.load25
+; AVXNC-NEXT:    jne .LBB12_20
+; AVXNC-NEXT:  # %bb.19: # %cond.load25
 ; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
 ; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7,8],ymm2[9],ymm0[10,11,12,13,14,15]
 ; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVXNC-NEXT:  .LBB12_21: # %else26
+; AVXNC-NEXT:  .LBB12_20: # %else26
 ; AVXNC-NEXT:    xorl %eax, %eax
 ; AVXNC-NEXT:    testb %al, %al
-; AVXNC-NEXT:    jne .LBB12_23
-; AVXNC-NEXT:  # %bb.22: # %cond.load28
+; AVXNC-NEXT:    jne .LBB12_22
+; AVXNC-NEXT:  # %bb.21: # %cond.load28
 ; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
 ; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4,5,6,7,8,9],ymm2[10],ymm0[11,12,13,14,15]
 ; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVXNC-NEXT:  .LBB12_23: # %else29
+; AVXNC-NEXT:  .LBB12_22: # %else29
 ; AVXNC-NEXT:    xorl %eax, %eax
 ; AVXNC-NEXT:    testb %al, %al
-; AVXNC-NEXT:    jne .LBB12_25
-; AVXNC-NEXT:  # %bb.24: # %cond.load31
+; AVXNC-NEXT:    jne .LBB12_24
+; AVXNC-NEXT:  # %bb.23: # %cond.load31
 ; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
 ; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15]
 ; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVXNC-NEXT:  .LBB12_25: # %else32
+; AVXNC-NEXT:  .LBB12_24: # %else32
 ; AVXNC-NEXT:    xorl %eax, %eax
 ; AVXNC-NEXT:    testb %al, %al
-; AVXNC-NEXT:    jne .LBB12_27
-; AVXNC-NEXT:  # %bb.26: # %cond.load34
+; AVXNC-NEXT:    jne .LBB12_26
+; AVXNC-NEXT:  # %bb.25: # %cond.load34
 ; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
 ; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7,8,9,10,11],ymm2[12],ymm0[13,14,15]
 ; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVXNC-NEXT:  .LBB12_27: # %else35
+; AVXNC-NEXT:  .LBB12_26: # %else35
 ; AVXNC-NEXT:    xorl %eax, %eax
 ; AVXNC-NEXT:    testb %al, %al
-; AVXNC-NEXT:    jne .LBB12_29
-; AVXNC-NEXT:  # %bb.28: # %cond.load37
+; AVXNC-NEXT:    jne .LBB12_28
+; AVXNC-NEXT:  # %bb.27: # %cond.load37
 ; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
 ; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7,8,9,10,11,12],ymm2[13],ymm0[14,15]
 ; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVXNC-NEXT:  .LBB12_29: # %else38
+; AVXNC-NEXT:  .LBB12_28: # %else38
 ; AVXNC-NEXT:    xorl %eax, %eax
 ; AVXNC-NEXT:    testb %al, %al
-; AVXNC-NEXT:    jne .LBB12_31
-; AVXNC-NEXT:  # %bb.30: # %cond.load40
+; AVXNC-NEXT:    jne .LBB12_30
+; AVXNC-NEXT:  # %bb.29: # %cond.load40
 ; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
 ; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7,8,9,10,11,12,13],ymm2[14],ymm0[15]
 ; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVXNC-NEXT:  .LBB12_31: # %else41
+; AVXNC-NEXT:  .LBB12_30: # %else41
 ; AVXNC-NEXT:    xorl %eax, %eax
 ; AVXNC-NEXT:    testb %al, %al
-; AVXNC-NEXT:    jne .LBB12_33
-; AVXNC-NEXT:  # %bb.32: # %cond.load43
+; AVXNC-NEXT:    jne .LBB12_32
+; AVXNC-NEXT:  # %bb.31: # %cond.load43
 ; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
 ; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5,6],ymm2[7],ymm0[8,9,10,11,12,13,14],ymm2[15]
 ; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVXNC-NEXT:  .LBB12_33: # %else44
+; AVXNC-NEXT:  .LBB12_32: # %else44
 ; AVXNC-NEXT:    xorl %eax, %eax
 ; AVXNC-NEXT:    testb %al, %al
-; AVXNC-NEXT:    jne .LBB12_35
-; AVXNC-NEXT:  # %bb.34: # %cond.load46
+; AVXNC-NEXT:    jne .LBB12_34
+; AVXNC-NEXT:  # %bb.33: # %cond.load46
 ; AVXNC-NEXT:    vpinsrw $0, (%rax), %xmm1, %xmm2
 ; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVXNC-NEXT:  .LBB12_35: # %else47
+; AVXNC-NEXT:  .LBB12_34: # %else47
 ; AVXNC-NEXT:    xorl %eax, %eax
 ; AVXNC-NEXT:    testb %al, %al
-; AVXNC-NEXT:    jne .LBB12_37
-; AVXNC-NEXT:  # %bb.36: # %cond.load49
+; AVXNC-NEXT:    jne .LBB12_36
+; AVXNC-NEXT:  # %bb.35: # %cond.load49
 ; AVXNC-NEXT:    vpinsrw $1, (%rax), %xmm1, %xmm2
 ; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVXNC-NEXT:  .LBB12_37: # %else50
+; AVXNC-NEXT:  .LBB12_36: # %else50
 ; AVXNC-NEXT:    xorl %eax, %eax
 ; AVXNC-NEXT:    testb %al, %al
-; AVXNC-NEXT:    jne .LBB12_39
-; AVXNC-NEXT:  # %bb.38: # %cond.load52
+; AVXNC-NEXT:    jne .LBB12_38
+; AVXNC-NEXT:  # %bb.37: # %cond.load52
 ; AVXNC-NEXT:    vpinsrw $2, (%rax), %xmm1, %xmm2
 ; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVXNC-NEXT:  .LBB12_39: # %else53
+; AVXNC-NEXT:  .LBB12_38: # %else53
 ; AVXNC-NEXT:    xorl %eax, %eax
 ; AVXNC-NEXT:    testb %al, %al
-; AVXNC-NEXT:    jne .LBB12_41
-; AVXNC-NEXT:  # %bb.40: # %cond.load55
+; AVXNC-NEXT:    jne .LBB12_40
+; AVXNC-NEXT:  # %bb.39: # %cond.load55
 ; AVXNC-NEXT:    vpinsrw $3, (%rax), %xmm1, %xmm2
 ; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVXNC-NEXT:  .LBB12_41: # %else56
+; AVXNC-NEXT:  .LBB12_40: # %else56
 ; AVXNC-NEXT:    xorl %eax, %eax
 ; AVXNC-NEXT:    testb %al, %al
-; AVXNC-NEXT:    jne .LBB12_43
-; AVXNC-NEXT:  # %bb.42: # %cond.load58
+; AVXNC-NEXT:    jne .LBB12_42
+; AVXNC-NEXT:  # %bb.41: # %cond.load58
 ; AVXNC-NEXT:    vpinsrw $4, (%rax), %xmm1, %xmm2
 ; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVXNC-NEXT:  .LBB12_43: # %else59
+; AVXNC-NEXT:  .LBB12_42: # %else59
 ; AVXNC-NEXT:    xorl %eax, %eax
 ; AVXNC-NEXT:    testb %al, %al
-; AVXNC-NEXT:    jne .LBB12_45
-; AVXNC-NEXT:  # %bb.44: # %cond.load61
+; AVXNC-NEXT:    jne .LBB12_44
+; AVXNC-NEXT:  # %bb.43: # %cond.load61
 ; AVXNC-NEXT:    vpinsrw $5, (%rax), %xmm1, %xmm2
 ; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVXNC-NEXT:  .LBB12_45: # %else62
+; AVXNC-NEXT:  .LBB12_44: # %else62
 ; AVXNC-NEXT:    xorl %eax, %eax
 ; AVXNC-NEXT:    testb %al, %al
-; AVXNC-NEXT:    jne .LBB12_47
-; AVXNC-NEXT:  # %bb.46: # %cond.load64
+; AVXNC-NEXT:    jne .LBB12_46
+; AVXNC-NEXT:  # %bb.45: # %cond.load64
 ; AVXNC-NEXT:    vpinsrw $6, (%rax), %xmm1, %xmm2
 ; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVXNC-NEXT:  .LBB12_47: # %else65
+; AVXNC-NEXT:  .LBB12_46: # %else65
 ; AVXNC-NEXT:    xorl %eax, %eax
 ; AVXNC-NEXT:    testb %al, %al
-; AVXNC-NEXT:    jne .LBB12_49
-; AVXNC-NEXT:  # %bb.48: # %cond.load67
+; AVXNC-NEXT:    jne .LBB12_48
+; AVXNC-NEXT:  # %bb.47: # %cond.load67
 ; AVXNC-NEXT:    vpinsrw $7, (%rax), %xmm1, %xmm2
 ; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVXNC-NEXT:  .LBB12_49: # %else68
+; AVXNC-NEXT:  .LBB12_48: # %else68
 ; AVXNC-NEXT:    xorl %eax, %eax
 ; AVXNC-NEXT:    testb %al, %al
-; AVXNC-NEXT:    jne .LBB12_51
-; AVXNC-NEXT:  # %bb.50: # %cond.load70
+; AVXNC-NEXT:    jne .LBB12_50
+; AVXNC-NEXT:  # %bb.49: # %cond.load70
 ; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
 ; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
 ; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVXNC-NEXT:  .LBB12_51: # %else71
+; AVXNC-NEXT:  .LBB12_50: # %else71
 ; AVXNC-NEXT:    xorl %eax, %eax
 ; AVXNC-NEXT:    testb %al, %al
-; AVXNC-NEXT:    jne .LBB12_53
-; AVXNC-NEXT:  # %bb.52: # %cond.load73
+; AVXNC-NEXT:    jne .LBB12_52
+; AVXNC-NEXT:  # %bb.51: # %cond.load73
 ; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
 ; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7,8],ymm2[9],ymm1[10,11,12,13,14,15]
 ; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVXNC-NEXT:  .LBB12_53: # %else74
+; AVXNC-NEXT:  .LBB12_52: # %else74
 ; AVXNC-NEXT:    xorl %eax, %eax
 ; AVXNC-NEXT:    testb %al, %al
-; AVXNC-NEXT:    jne .LBB12_55
-; AVXNC-NEXT:  # %bb.54: # %cond.load76
+; AVXNC-NEXT:    jne .LBB12_54
+; AVXNC-NEXT:  # %bb.53: # %cond.load76
 ; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
 ; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6,7,8,9],ymm2[10],ymm1[11,12,13,14,15]
 ; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVXNC-NEXT:  .LBB12_55: # %else77
+; AVXNC-NEXT:  .LBB12_54: # %else77
 ; AVXNC-NEXT:    xorl %eax, %eax
 ; AVXNC-NEXT:    testb %al, %al
-; AVXNC-NEXT:    jne .LBB12_57
-; AVXNC-NEXT:  # %bb.56: # %cond.load79
+; AVXNC-NEXT:    jne .LBB12_56
+; AVXNC-NEXT:  # %bb.55: # %cond.load79
 ; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
 ; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7,8,9,10],ymm2[11],ymm1[12,13,14,15]
 ; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVXNC-NEXT:  .LBB12_57: # %else80
+; AVXNC-NEXT:  .LBB12_56: # %else80
 ; AVXNC-NEXT:    xorl %eax, %eax
 ; AVXNC-NEXT:    testb %al, %al
-; AVXNC-NEXT:    jne .LBB12_59
-; AVXNC-NEXT:  # %bb.58: # %cond.load82
+; AVXNC-NEXT:    jne .LBB12_58
+; AVXNC-NEXT:  # %bb.57: # %cond.load82
 ; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
 ; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7,8,9,10,11],ymm2[12],ymm1[13,14,15]
 ; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVXNC-NEXT:  .LBB12_59: # %else83
+; AVXNC-NEXT:  .LBB12_58: # %else83
 ; AVXNC-NEXT:    xorl %eax, %eax
 ; AVXNC-NEXT:    testb %al, %al
-; AVXNC-NEXT:    jne .LBB12_61
-; AVXNC-NEXT:  # %bb.60: # %cond.load85
+; AVXNC-NEXT:    jne .LBB12_60
+; AVXNC-NEXT:  # %bb.59: # %cond.load85
 ; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
 ; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7,8,9,10,11,12],ymm2[13],ymm1[14,15]
 ; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVXNC-NEXT:  .LBB12_61: # %else86
+; AVXNC-NEXT:  .LBB12_60: # %else86
 ; AVXNC-NEXT:    xorl %eax, %eax
 ; AVXNC-NEXT:    testb %al, %al
-; AVXNC-NEXT:    jne .LBB12_63
-; AVXNC-NEXT:  # %bb.62: # %cond.load88
+; AVXNC-NEXT:    jne .LBB12_62
+; AVXNC-NEXT:  # %bb.61: # %cond.load88
 ; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
 ; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15]
 ; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVXNC-NEXT:  .LBB12_63: # %else89
+; AVXNC-NEXT:  .LBB12_62: # %else89
 ; AVXNC-NEXT:    xorl %eax, %eax
 ; AVXNC-NEXT:    testb %al, %al
-; AVXNC-NEXT:    jne .LBB12_65
-; AVXNC-NEXT:  # %bb.64: # %cond.load91
+; AVXNC-NEXT:    jne .LBB12_64
+; AVXNC-NEXT:  # %bb.63: # %cond.load91
 ; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
 ; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,6],ymm2[7],ymm1[8,9,10,11,12,13,14],ymm2[15]
 ; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVXNC-NEXT:  .LBB12_65: # %else92
+; AVXNC-NEXT:  .LBB12_64: # %else92
 ; AVXNC-NEXT:    retq
   %1 = call <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr poison, i32 2, <32 x i1> poison, <32 x bfloat> <bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80>)
   ret <32 x bfloat> %1
@@ -1714,30 +1699,11 @@ define <4 x float> @pr64460_1(<4 x bfloat> %a) {
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
-; F16-LABEL: pr64460_1:
-; F16:       # %bb.0:
-; F16-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; F16-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; F16-NEXT:    retq
-;
-; AVXNC-LABEL: pr64460_1:
-; AVXNC:       # %bb.0:
-; AVXNC-NEXT:    vpextrw $1, %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm1
-; AVXNC-NEXT:    vmovd %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm2
-; AVXNC-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVXNC-NEXT:    vpextrw $2, %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm2
-; AVXNC-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVXNC-NEXT:    vpextrw $3, %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm0
-; AVXNC-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVXNC-NEXT:    retq
+; AVX-LABEL: pr64460_1:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT:    retq
   %b = fpext <4 x bfloat> %a to <4 x float>
   ret <4 x float> %b
 }
@@ -1782,46 +1748,11 @@ define <8 x float> @pr64460_2(<8 x bfloat> %a) {
 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
 ; SSE2-NEXT:    retq
 ;
-; F16-LABEL: pr64460_2:
-; F16:       # %bb.0:
-; F16-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; F16-NEXT:    vpslld $16, %ymm0, %ymm0
-; F16-NEXT:    retq
-;
-; AVXNC-LABEL: pr64460_2:
-; AVXNC:       # %bb.0:
-; AVXNC-NEXT:    vpextrw $5, %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm1
-; AVXNC-NEXT:    vpextrw $4, %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm2
-; AVXNC-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVXNC-NEXT:    vpextrw $6, %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm2
-; AVXNC-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVXNC-NEXT:    vpextrw $7, %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm2
-; AVXNC-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; AVXNC-NEXT:    vpextrw $1, %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm2
-; AVXNC-NEXT:    vmovd %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm3
-; AVXNC-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVXNC-NEXT:    vpextrw $2, %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm3
-; AVXNC-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
-; AVXNC-NEXT:    vpextrw $3, %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm0
-; AVXNC-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
-; AVXNC-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVXNC-NEXT:    retq
+; AVX-LABEL: pr64460_2:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX-NEXT:    vpslld $16, %ymm0, %ymm0
+; AVX-NEXT:    retq
   %b = fpext <8 x bfloat> %a to <8 x float>
   ret <8 x float> %b
 }
@@ -1909,70 +1840,12 @@ define <16 x float> @pr64460_3(<16 x bfloat> %a) {
 ;
 ; AVXNC-LABEL: pr64460_3:
 ; AVXNC:       # %bb.0:
-; AVXNC-NEXT:    vpextrw $5, %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm1
-; AVXNC-NEXT:    vpextrw $4, %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm2
-; AVXNC-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVXNC-NEXT:    vpextrw $6, %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm2
-; AVXNC-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVXNC-NEXT:    vpextrw $7, %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm2
-; AVXNC-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; AVXNC-NEXT:    vpextrw $1, %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm2
-; AVXNC-NEXT:    vmovd %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm3
-; AVXNC-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVXNC-NEXT:    vpextrw $2, %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm3
-; AVXNC-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
-; AVXNC-NEXT:    vpextrw $3, %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm3
-; AVXNC-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
-; AVXNC-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm2
+; AVXNC-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVXNC-NEXT:    vpslld $16, %ymm1, %ymm2
 ; AVXNC-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVXNC-NEXT:    vpextrw $5, %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm1
-; AVXNC-NEXT:    vpextrw $4, %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm3
-; AVXNC-NEXT:    vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
-; AVXNC-NEXT:    vpextrw $6, %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm3
-; AVXNC-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
-; AVXNC-NEXT:    vpextrw $7, %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm3
-; AVXNC-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
-; AVXNC-NEXT:    vpextrw $1, %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm3
-; AVXNC-NEXT:    vmovd %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm4
-; AVXNC-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
-; AVXNC-NEXT:    vpextrw $2, %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm4
-; AVXNC-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
-; AVXNC-NEXT:    vpextrw $3, %xmm0, %eax
-; AVXNC-NEXT:    shll $16, %eax
-; AVXNC-NEXT:    vmovd %eax, %xmm0
-; AVXNC-NEXT:    vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0]
-; AVXNC-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVXNC-NEXT:    vmovaps %ymm2, %ymm0
+; AVXNC-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVXNC-NEXT:    vpslld $16, %ymm0, %ymm1
+; AVXNC-NEXT:    vmovdqa %ymm2, %ymm0
 ; AVXNC-NEXT:    retq
   %b = fpext <16 x bfloat> %a to <16 x float>
   ret <16 x float> %b
@@ -2076,3 +1949,515 @@ define <8 x double> @pr64460_4(<8 x bfloat> %a) {
   %b = fpext <8 x bfloat> %a to <8 x double>
   ret <8 x double> %b
 }
+
+define <4 x bfloat> @fptrunc_v4f32(<4 x float> %a) nounwind {
+; SSE2-LABEL: fptrunc_v4f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %r14
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    subq $32, %rsp
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebx
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd %xmm0, %ebp
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd %xmm0, %r14d
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    pinsrw $0, %eax, %xmm0
+; SSE2-NEXT:    pinsrw $0, %r14d, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    pinsrw $0, %ebp, %xmm0
+; SSE2-NEXT:    pinsrw $0, %ebx, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    addq $32, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %r14
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; F16-LABEL: fptrunc_v4f32:
+; F16:       # %bb.0:
+; F16-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; F16-NEXT:    vcvtneps2bf16 %ymm0, %xmm0
+; F16-NEXT:    vzeroupper
+; F16-NEXT:    retq
+;
+; AVXNC-LABEL: fptrunc_v4f32:
+; AVXNC:       # %bb.0:
+; AVXNC-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVXNC-NEXT:    {vex} vcvtneps2bf16 %ymm0, %xmm0
+; AVXNC-NEXT:    vzeroupper
+; AVXNC-NEXT:    retq
+  %b = fptrunc <4 x float> %a to <4 x bfloat>
+  ret <4 x bfloat> %b
+}
+
+define <8 x bfloat> @fptrunc_v8f32(<8 x float> %a) nounwind {
+; SSE2-LABEL: fptrunc_v8f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %r14
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    subq $32, %rsp
+; SSE2-NEXT:    movaps %xmm1, (%rsp) # 16-byte Spill
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebx
+; SSE2-NEXT:    shll $16, %ebx
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movzwl %ax, %r14d
+; SSE2-NEXT:    orl %ebx, %r14d
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebp
+; SSE2-NEXT:    shll $16, %ebp
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movzwl %ax, %ebx
+; SSE2-NEXT:    orl %ebp, %ebx
+; SSE2-NEXT:    shlq $32, %rbx
+; SSE2-NEXT:    orq %r14, %rbx
+; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebp
+; SSE2-NEXT:    shll $16, %ebp
+; SSE2-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movzwl %ax, %r14d
+; SSE2-NEXT:    orl %ebp, %r14d
+; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebp
+; SSE2-NEXT:    shll $16, %ebp
+; SSE2-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movzwl %ax, %eax
+; SSE2-NEXT:    orl %ebp, %eax
+; SSE2-NEXT:    shlq $32, %rax
+; SSE2-NEXT:    orq %r14, %rax
+; SSE2-NEXT:    movq %rax, %xmm1
+; SSE2-NEXT:    movq %rbx, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    addq $32, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %r14
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; F16-LABEL: fptrunc_v8f32:
+; F16:       # %bb.0:
+; F16-NEXT:    vcvtneps2bf16 %ymm0, %xmm0
+; F16-NEXT:    vzeroupper
+; F16-NEXT:    retq
+;
+; AVXNC-LABEL: fptrunc_v8f32:
+; AVXNC:       # %bb.0:
+; AVXNC-NEXT:    {vex} vcvtneps2bf16 %ymm0, %xmm0
+; AVXNC-NEXT:    vzeroupper
+; AVXNC-NEXT:    retq
+  %b = fptrunc <8 x float> %a to <8 x bfloat>
+  ret <8 x bfloat> %b
+}
+
+define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind {
+; SSE2-LABEL: fptrunc_v16f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %r15
+; SSE2-NEXT:    pushq %r14
+; SSE2-NEXT:    pushq %r12
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    subq $64, %rsp
+; SSE2-NEXT:    movaps %xmm3, (%rsp) # 16-byte Spill
+; SSE2-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1]
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebx
+; SSE2-NEXT:    shll $16, %ebx
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movzwl %ax, %r14d
+; SSE2-NEXT:    orl %ebx, %r14d
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebp
+; SSE2-NEXT:    shll $16, %ebp
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movzwl %ax, %ebx
+; SSE2-NEXT:    orl %ebp, %ebx
+; SSE2-NEXT:    shlq $32, %rbx
+; SSE2-NEXT:    orq %r14, %rbx
+; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebp
+; SSE2-NEXT:    shll $16, %ebp
+; SSE2-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movzwl %ax, %r15d
+; SSE2-NEXT:    orl %ebp, %r15d
+; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebp
+; SSE2-NEXT:    shll $16, %ebp
+; SSE2-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movzwl %ax, %r14d
+; SSE2-NEXT:    orl %ebp, %r14d
+; SSE2-NEXT:    shlq $32, %r14
+; SSE2-NEXT:    orq %r15, %r14
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebp
+; SSE2-NEXT:    shll $16, %ebp
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movzwl %ax, %r12d
+; SSE2-NEXT:    orl %ebp, %r12d
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebp
+; SSE2-NEXT:    shll $16, %ebp
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movzwl %ax, %r15d
+; SSE2-NEXT:    orl %ebp, %r15d
+; SSE2-NEXT:    shlq $32, %r15
+; SSE2-NEXT:    orq %r12, %r15
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebp
+; SSE2-NEXT:    shll $16, %ebp
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movzwl %ax, %r12d
+; SSE2-NEXT:    orl %ebp, %r12d
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebp
+; SSE2-NEXT:    shll $16, %ebp
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movzwl %ax, %eax
+; SSE2-NEXT:    orl %ebp, %eax
+; SSE2-NEXT:    shlq $32, %rax
+; SSE2-NEXT:    orq %r12, %rax
+; SSE2-NEXT:    movq %rax, %xmm1
+; SSE2-NEXT:    movq %r15, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    movq %r14, %xmm2
+; SSE2-NEXT:    movq %rbx, %xmm1
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    addq $64, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %r12
+; SSE2-NEXT:    popq %r14
+; SSE2-NEXT:    popq %r15
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; F16-LABEL: fptrunc_v16f32:
+; F16:       # %bb.0:
+; F16-NEXT:    vcvtneps2bf16 %zmm0, %ymm0
+; F16-NEXT:    retq
+;
+; AVXNC-LABEL: fptrunc_v16f32:
+; AVXNC:       # %bb.0:
+; AVXNC-NEXT:    pushq %rbp
+; AVXNC-NEXT:    movq %rsp, %rbp
+; AVXNC-NEXT:    andq $-32, %rsp
+; AVXNC-NEXT:    subq $64, %rsp
+; AVXNC-NEXT:    {vex} vcvtneps2bf16 %ymm1, %xmm1
+; AVXNC-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; AVXNC-NEXT:    {vex} vcvtneps2bf16 %ymm0, %xmm0
+; AVXNC-NEXT:    vmovaps %xmm0, (%rsp)
+; AVXNC-NEXT:    vmovaps (%rsp), %ymm0
+; AVXNC-NEXT:    movq %rbp, %rsp
+; AVXNC-NEXT:    popq %rbp
+; AVXNC-NEXT:    retq
+  %b = fptrunc <16 x float> %a to <16 x bfloat>
+  ret <16 x bfloat> %b
+}
+
+define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind {
+; SSE2-LABEL: fptrunc_v8f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %r14
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    subq $64, %rsp
+; SSE2-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movaps %xmm1, (%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    callq __truncdfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebx
+; SSE2-NEXT:    shll $16, %ebx
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    callq __truncdfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movzwl %ax, %r14d
+; SSE2-NEXT:    orl %ebx, %r14d
+; SSE2-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    callq __truncdfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebp
+; SSE2-NEXT:    shll $16, %ebp
+; SSE2-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    callq __truncdfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movzwl %ax, %ebx
+; SSE2-NEXT:    orl %ebp, %ebx
+; SSE2-NEXT:    shlq $32, %rbx
+; SSE2-NEXT:    orq %r14, %rbx
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    callq __truncdfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebp
+; SSE2-NEXT:    shll $16, %ebp
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    callq __truncdfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movzwl %ax, %r14d
+; SSE2-NEXT:    orl %ebp, %r14d
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    callq __truncdfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebp
+; SSE2-NEXT:    shll $16, %ebp
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    callq __truncdfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movzwl %ax, %eax
+; SSE2-NEXT:    orl %ebp, %eax
+; SSE2-NEXT:    shlq $32, %rax
+; SSE2-NEXT:    orq %r14, %rax
+; SSE2-NEXT:    movq %rax, %xmm1
+; SSE2-NEXT:    movq %rbx, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    addq $64, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %r14
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; F16-LABEL: fptrunc_v8f64:
+; F16:       # %bb.0:
+; F16-NEXT:    pushq %rbp
+; F16-NEXT:    pushq %r15
+; F16-NEXT:    pushq %r14
+; F16-NEXT:    pushq %r13
+; F16-NEXT:    pushq %r12
+; F16-NEXT:    pushq %rbx
+; F16-NEXT:    subq $136, %rsp
+; F16-NEXT:    vmovupd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; F16-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; F16-NEXT:    vzeroupper
+; F16-NEXT:    callq __truncdfbf2@PLT
+; F16-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; F16-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; F16-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; F16-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; F16-NEXT:    vzeroupper
+; F16-NEXT:    callq __truncdfbf2@PLT
+; F16-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; F16-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; F16-NEXT:    # xmm0 = mem[1,0]
+; F16-NEXT:    callq __truncdfbf2@PLT
+; F16-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; F16-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; F16-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
+; F16-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; F16-NEXT:    vzeroupper
+; F16-NEXT:    callq __truncdfbf2@PLT
+; F16-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; F16-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; F16-NEXT:    # xmm0 = mem[1,0]
+; F16-NEXT:    callq __truncdfbf2@PLT
+; F16-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; F16-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; F16-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
+; F16-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; F16-NEXT:    vzeroupper
+; F16-NEXT:    callq __truncdfbf2@PLT
+; F16-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; F16-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; F16-NEXT:    # xmm0 = mem[1,0]
+; F16-NEXT:    callq __truncdfbf2@PLT
+; F16-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; F16-NEXT:    vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; F16-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; F16-NEXT:    vmovd %xmm0, %ebp
+; F16-NEXT:    vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; F16-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; F16-NEXT:    vmovd %xmm0, %r14d
+; F16-NEXT:    vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; F16-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; F16-NEXT:    vmovd %xmm0, %r15d
+; F16-NEXT:    vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; F16-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; F16-NEXT:    vmovd %xmm0, %r12d
+; F16-NEXT:    vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; F16-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; F16-NEXT:    vmovd %xmm0, %r13d
+; F16-NEXT:    vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; F16-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; F16-NEXT:    vmovd %xmm0, %ebx
+; F16-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; F16-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; F16-NEXT:    vzeroupper
+; F16-NEXT:    callq __truncdfbf2@PLT
+; F16-NEXT:    vmovd %xmm0, %eax
+; F16-NEXT:    vmovd %eax, %xmm0
+; F16-NEXT:    vpinsrw $1, %ebx, %xmm0, %xmm0
+; F16-NEXT:    vpinsrw $2, %r13d, %xmm0, %xmm0
+; F16-NEXT:    vpinsrw $3, %r12d, %xmm0, %xmm0
+; F16-NEXT:    vpinsrw $4, %r15d, %xmm0, %xmm0
+; F16-NEXT:    vpinsrw $5, %r14d, %xmm0, %xmm0
+; F16-NEXT:    vpinsrw $6, %ebp, %xmm0, %xmm0
+; F16-NEXT:    vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; F16-NEXT:    addq $136, %rsp
+; F16-NEXT:    popq %rbx
+; F16-NEXT:    popq %r12
+; F16-NEXT:    popq %r13
+; F16-NEXT:    popq %r14
+; F16-NEXT:    popq %r15
+; F16-NEXT:    popq %rbp
+; F16-NEXT:    retq
+;
+; AVXNC-LABEL: fptrunc_v8f64:
+; AVXNC:       # %bb.0:
+; AVXNC-NEXT:    pushq %rbp
+; AVXNC-NEXT:    pushq %r15
+; AVXNC-NEXT:    pushq %r14
+; AVXNC-NEXT:    pushq %r13
+; AVXNC-NEXT:    pushq %r12
+; AVXNC-NEXT:    pushq %rbx
+; AVXNC-NEXT:    subq $120, %rsp
+; AVXNC-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVXNC-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVXNC-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVXNC-NEXT:    vzeroupper
+; AVXNC-NEXT:    callq __truncdfbf2@PLT
+; AVXNC-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVXNC-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVXNC-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVXNC-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVXNC-NEXT:    vzeroupper
+; AVXNC-NEXT:    callq __truncdfbf2@PLT
+; AVXNC-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVXNC-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVXNC-NEXT:    # xmm0 = mem[1,0]
+; AVXNC-NEXT:    callq __truncdfbf2@PLT
+; AVXNC-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVXNC-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVXNC-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVXNC-NEXT:    vzeroupper
+; AVXNC-NEXT:    callq __truncdfbf2@PLT
+; AVXNC-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVXNC-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVXNC-NEXT:    # xmm0 = mem[1,0]
+; AVXNC-NEXT:    callq __truncdfbf2@PLT
+; AVXNC-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVXNC-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVXNC-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVXNC-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVXNC-NEXT:    vzeroupper
+; AVXNC-NEXT:    callq __truncdfbf2@PLT
+; AVXNC-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVXNC-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVXNC-NEXT:    # xmm0 = mem[1,0]
+; AVXNC-NEXT:    callq __truncdfbf2@PLT
+; AVXNC-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVXNC-NEXT:    vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; AVXNC-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; AVXNC-NEXT:    vmovd %xmm0, %ebp
+; AVXNC-NEXT:    vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; AVXNC-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; AVXNC-NEXT:    vmovd %xmm0, %r14d
+; AVXNC-NEXT:    vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; AVXNC-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; AVXNC-NEXT:    vmovd %xmm0, %r15d
+; AVXNC-NEXT:    vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; AVXNC-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; AVXNC-NEXT:    vmovd %xmm0, %r12d
+; AVXNC-NEXT:    vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; AVXNC-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; AVXNC-NEXT:    vmovd %xmm0, %r13d
+; AVXNC-NEXT:    vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; AVXNC-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; AVXNC-NEXT:    vmovd %xmm0, %ebx
+; AVXNC-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVXNC-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVXNC-NEXT:    vzeroupper
+; AVXNC-NEXT:    callq __truncdfbf2@PLT
+; AVXNC-NEXT:    vmovd %xmm0, %eax
+; AVXNC-NEXT:    vmovd %eax, %xmm0
+; AVXNC-NEXT:    vpinsrw $1, %ebx, %xmm0, %xmm0
+; AVXNC-NEXT:    vpinsrw $2, %r13d, %xmm0, %xmm0
+; AVXNC-NEXT:    vpinsrw $3, %r12d, %xmm0, %xmm0
+; AVXNC-NEXT:    vpinsrw $4, %r15d, %xmm0, %xmm0
+; AVXNC-NEXT:    vpinsrw $5, %r14d, %xmm0, %xmm0
+; AVXNC-NEXT:    vpinsrw $6, %ebp, %xmm0, %xmm0
+; AVXNC-NEXT:    vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVXNC-NEXT:    addq $120, %rsp
+; AVXNC-NEXT:    popq %rbx
+; AVXNC-NEXT:    popq %r12
+; AVXNC-NEXT:    popq %r13
+; AVXNC-NEXT:    popq %r14
+; AVXNC-NEXT:    popq %r15
+; AVXNC-NEXT:    popq %rbp
+; AVXNC-NEXT:    retq
+  %b = fptrunc <8 x double> %a to <8 x bfloat>
+  ret <8 x bfloat> %b
+}

From c138c8a72e360c65da7cfe0cd4b716d78cdc428d Mon Sep 17 00:00:00 2001
From: Ying Yi <ying.yi@sony.com>
Date: Tue, 29 Aug 2023 20:02:13 +0100
Subject: [PATCH 77/92] [UBSan] Disable the function and kcfi sanitizers on an
 execute-only target.

An execute-only target disallows data access to code sections.
-fsanitize=function and -fsanitize=kcfi instrument indirect function
calls to load a type hash before the function label. This results in a
non-execute access to the code section and a runtime error.

To solve the issue, -fsanitize=function should not be included in any
check group (e.g. undefined) on an execute-only target. If a user passes
-fsanitize=undefined, there is no error and no warning. However, if the
user explicitly passes -fsanitize=function or -fsanitize=kcfi on an
execute-only target, an error will be emitted.

Fixes: https://github.com/llvm/llvm-project/issues/64931.

Reviewed By: MaskRay, probinson, simon_tatham

Differential Revision: https://reviews.llvm.org/D158614
---
 clang/include/clang/Basic/Sanitizers.h        |  9 ++++++++
 clang/lib/Basic/CMakeLists.txt                |  1 +
 clang/lib/Basic/Sanitizers.cpp                | 13 +++++++++++
 clang/lib/Driver/SanitizerArgs.cpp            | 22 +++++++++++++++++++
 .../test/CodeGenObjCXX/crash-function-type.mm |  3 +++
 clang/test/Driver/fsanitize.c                 | 14 ++++++++++++
 6 files changed, 62 insertions(+)

diff --git a/clang/include/clang/Basic/Sanitizers.h b/clang/include/clang/Basic/Sanitizers.h
index db53010645ae3..c212f80fe03ad 100644
--- a/clang/include/clang/Basic/Sanitizers.h
+++ b/clang/include/clang/Basic/Sanitizers.h
@@ -23,7 +23,11 @@
 
 namespace llvm {
 class hash_code;
+class Triple;
+namespace opt {
+class ArgList;
 }
+} // namespace llvm
 
 namespace clang {
 
@@ -205,6 +209,11 @@ StringRef AsanDetectStackUseAfterReturnModeToString(
 llvm::AsanDetectStackUseAfterReturnMode
 AsanDetectStackUseAfterReturnModeFromString(StringRef modeStr);
 
+/// Return true if an execute-only target disallows data access to code
+/// sections.
+bool isExecuteOnlyTarget(const llvm::Triple &Triple,
+                         const llvm::opt::ArgList &Args);
+
 } // namespace clang
 
 #endif // LLVM_CLANG_BASIC_SANITIZERS_H
diff --git a/clang/lib/Basic/CMakeLists.txt b/clang/lib/Basic/CMakeLists.txt
index caa1b6002e6f1..d6620ec204ad4 100644
--- a/clang/lib/Basic/CMakeLists.txt
+++ b/clang/lib/Basic/CMakeLists.txt
@@ -1,4 +1,5 @@
 set(LLVM_LINK_COMPONENTS
+  Option
   Support
   TargetParser
   )
diff --git a/clang/lib/Basic/Sanitizers.cpp b/clang/lib/Basic/Sanitizers.cpp
index 62ccdf8e9bbf2..6fbc32df31489 100644
--- a/clang/lib/Basic/Sanitizers.cpp
+++ b/clang/lib/Basic/Sanitizers.cpp
@@ -11,10 +11,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Basic/Sanitizers.h"
+#include "clang/Driver/Options.h"
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/Option/ArgList.h"
 #include "llvm/Support/MathExtras.h"
+#include "llvm/TargetParser/Triple.h"
 
 using namespace clang;
 
@@ -112,4 +115,14 @@ AsanDetectStackUseAfterReturnModeFromString(StringRef modeStr) {
       .Default(llvm::AsanDetectStackUseAfterReturnMode::Invalid);
 }
 
+bool isExecuteOnlyTarget(const llvm::Triple &Triple,
+                         const llvm::opt::ArgList &Args) {
+  if (Triple.isPS5())
+    return true;
+
+  // On Arm, the clang `-mexecute-only` option is used to generate the
+  // execute-only output (no data access to code sections).
+  return Args.hasFlag(clang::driver::options::OPT_mexecute_only,
+                      clang::driver::options::OPT_mno_execute_only, false);
+}
 } // namespace clang
diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp
index c3ce13f93464d..a4e9475947487 100644
--- a/clang/lib/Driver/SanitizerArgs.cpp
+++ b/clang/lib/Driver/SanitizerArgs.cpp
@@ -37,6 +37,8 @@ static const SanitizerMask NeedsUbsanCxxRt =
     SanitizerKind::Vptr | SanitizerKind::CFI;
 static const SanitizerMask NotAllowedWithTrap = SanitizerKind::Vptr;
 static const SanitizerMask NotAllowedWithMinimalRuntime = SanitizerKind::Vptr;
+static const SanitizerMask NotAllowedWithExecuteOnly =
+    SanitizerKind::Function | SanitizerKind::KCFI;
 static const SanitizerMask RequiresPIE =
     SanitizerKind::DataFlow | SanitizerKind::Scudo;
 static const SanitizerMask NeedsUnwindTables =
@@ -395,6 +397,22 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC,
           DiagnosedKinds |= SanitizerKind::Function;
         }
       }
+      // -fsanitize=function and -fsanitize=kcfi instrument indirect function
+      // calls to load a type hash before the function label. Therefore, an
+      // execute-only target doesn't support the function and kcfi sanitizers.
+      const llvm::Triple &Triple = TC.getTriple();
+      if (isExecuteOnlyTarget(Triple, Args)) {
+        if (SanitizerMask KindsToDiagnose =
+                Add & NotAllowedWithExecuteOnly & ~DiagnosedKinds) {
+          if (DiagnoseErrors) {
+            std::string Desc = describeSanitizeArg(Arg, KindsToDiagnose);
+            D.Diag(diag::err_drv_argument_not_allowed_with)
+                << Desc << Triple.str();
+          }
+          DiagnosedKinds |= KindsToDiagnose;
+        }
+        Add &= ~NotAllowedWithExecuteOnly;
+      }
 
       // FIXME: Make CFI on member function calls compatible with cross-DSO CFI.
       // There are currently two problems:
@@ -457,6 +475,10 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC,
       if (MinimalRuntime) {
         Add &= ~NotAllowedWithMinimalRuntime;
       }
+      // NotAllowedWithExecuteOnly is silently discarded on an execute-only
+      // target if implicitly enabled through group expansion.
+      if (isExecuteOnlyTarget(Triple, Args))
+        Add &= ~NotAllowedWithExecuteOnly;
       if (CfiCrossDso)
         Add &= ~SanitizerKind::CFIMFCall;
       Add &= Supported;
diff --git a/clang/test/CodeGenObjCXX/crash-function-type.mm b/clang/test/CodeGenObjCXX/crash-function-type.mm
index 53acc58dfc44d..280497a3258a4 100644
--- a/clang/test/CodeGenObjCXX/crash-function-type.mm
+++ b/clang/test/CodeGenObjCXX/crash-function-type.mm
@@ -1,3 +1,6 @@
+// Mark test as unsupported on PS5 because PS5 doesn't support the function sanitizer.
+// UNSUPPORTED: target=x86_64-sie-ps5
+
 // RUN: %clang_cc1 -fblocks -fsanitize=function -emit-llvm %s -o %t
 
 void g(void (^)());
diff --git a/clang/test/Driver/fsanitize.c b/clang/test/Driver/fsanitize.c
index 182de9f486444..9442f6b91471f 100644
--- a/clang/test/Driver/fsanitize.c
+++ b/clang/test/Driver/fsanitize.c
@@ -971,3 +971,17 @@
 
 // RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined,function -mcmodel=large %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UBSAN-FUNCTION-CODE-MODEL
 // CHECK-UBSAN-FUNCTION-CODE-MODEL: error: invalid argument '-fsanitize=function' only allowed with '-mcmodel=small'
+
+// RUN: %clang --target=x86_64-sie-ps5 -fsanitize=function %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UBSAN-FUNCTION
+// RUN: %clang --target=x86_64-sie-ps5 -fsanitize=undefined -fsanitize=function %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UBSAN-FUNCTION
+// RUN: %clang --target=x86_64-sie-ps5 -fsanitize=kcfi %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UBSAN-KCFI
+// RUN: %clang --target=x86_64-sie-ps5 -fsanitize=function -fsanitize=kcfi %s -### 2>&1 | FileCheck %s  --check-prefix=CHECK-UBSAN-KCFI --check-prefix=CHECK-UBSAN-FUNCTION
+// RUN: %clang --target=x86_64-sie-ps5 -fsanitize=undefined %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UBSAN-UNDEFINED
+
+// RUN: %clang --target=armv6t2-eabi -mexecute-only -fsanitize=function %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UBSAN-FUNCTION
+// RUN: %clang --target=armv6t2-eabi -mexecute-only -fsanitize=kcfi %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UBSAN-KCFI
+// RUN: %clang --target=armv6t2-eabi -mexecute-only -fsanitize=undefined %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UBSAN-UNDEFINED
+
+// CHECK-UBSAN-KCFI-DAG: error: invalid argument '-fsanitize=kcfi' not allowed with {{('x86_64-sie-ps5'|'armv6t2-unknown-unknown-eabi')}}
+// CHECK-UBSAN-FUNCTION-DAG: error: invalid argument '-fsanitize=function' not allowed with {{('x86_64-sie-ps5'|'armv6t2-unknown-unknown-eabi')}}
+// CHECK-UBSAN-UNDEFINED: "-fsanitize={{((alignment|array-bounds|bool|builtin|enum|float-cast-overflow|integer-divide-by-zero|nonnull-attribute|null|pointer-overflow|return|returns-nonnull-attribute|shift-base|shift-exponent|signed-integer-overflow|unreachable|vla-bound),?){17}"}}

From 051aa171d2177776964cc309d14d8333af66135f Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Wed, 30 Aug 2023 19:29:50 +0000
Subject: [PATCH 78/92] [Driver] Adjust -fsanitize=function & -mexecute-only
 interop after D158614

clangDriver depends on clangBasic, so clangBasic should not depend on
clangDriver, even just its header. Also remove clangBasic's dependency
on LLVMOption.

The issue can be seen through the bazel commit
d26dd681f9726ed7d43d7c0bdd8ee3cb2db69a2b which is reverted now.

Add hasFlagNoClaim and use it as we don't want to suppress
-Wunused-command-line-argument for -mexecute-only just because
-fsanitize= is specified.
---
 clang/include/clang/Basic/Sanitizers.h |  5 -----
 clang/lib/Basic/CMakeLists.txt         |  1 -
 clang/lib/Basic/Sanitizers.cpp         | 13 -------------
 clang/lib/Driver/SanitizerArgs.cpp     | 10 ++++++++++
 llvm/include/llvm/Option/ArgList.h     |  1 +
 llvm/lib/Option/ArgList.cpp            |  7 +++++++
 6 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/clang/include/clang/Basic/Sanitizers.h b/clang/include/clang/Basic/Sanitizers.h
index c212f80fe03ad..4659e45c78834 100644
--- a/clang/include/clang/Basic/Sanitizers.h
+++ b/clang/include/clang/Basic/Sanitizers.h
@@ -209,11 +209,6 @@ StringRef AsanDetectStackUseAfterReturnModeToString(
 llvm::AsanDetectStackUseAfterReturnMode
 AsanDetectStackUseAfterReturnModeFromString(StringRef modeStr);
 
-/// Return true if an execute-only target disallows data access to code
-/// sections.
-bool isExecuteOnlyTarget(const llvm::Triple &Triple,
-                         const llvm::opt::ArgList &Args);
-
 } // namespace clang
 
 #endif // LLVM_CLANG_BASIC_SANITIZERS_H
diff --git a/clang/lib/Basic/CMakeLists.txt b/clang/lib/Basic/CMakeLists.txt
index d6620ec204ad4..caa1b6002e6f1 100644
--- a/clang/lib/Basic/CMakeLists.txt
+++ b/clang/lib/Basic/CMakeLists.txt
@@ -1,5 +1,4 @@
 set(LLVM_LINK_COMPONENTS
-  Option
   Support
   TargetParser
   )
diff --git a/clang/lib/Basic/Sanitizers.cpp b/clang/lib/Basic/Sanitizers.cpp
index 6fbc32df31489..62ccdf8e9bbf2 100644
--- a/clang/lib/Basic/Sanitizers.cpp
+++ b/clang/lib/Basic/Sanitizers.cpp
@@ -11,13 +11,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Basic/Sanitizers.h"
-#include "clang/Driver/Options.h"
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringSwitch.h"
-#include "llvm/Option/ArgList.h"
 #include "llvm/Support/MathExtras.h"
-#include "llvm/TargetParser/Triple.h"
 
 using namespace clang;
 
@@ -115,14 +112,4 @@ AsanDetectStackUseAfterReturnModeFromString(StringRef modeStr) {
       .Default(llvm::AsanDetectStackUseAfterReturnMode::Invalid);
 }
 
-bool isExecuteOnlyTarget(const llvm::Triple &Triple,
-                         const llvm::opt::ArgList &Args) {
-  if (Triple.isPS5())
-    return true;
-
-  // On Arm, the clang `-mexecute-only` option is used to generate the
-  // execute-only output (no data access to code sections).
-  return Args.hasFlag(clang::driver::options::OPT_mexecute_only,
-                      clang::driver::options::OPT_mno_execute_only, false);
-}
 } // namespace clang
diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp
index a4e9475947487..12fe55be9113e 100644
--- a/clang/lib/Driver/SanitizerArgs.cpp
+++ b/clang/lib/Driver/SanitizerArgs.cpp
@@ -143,6 +143,16 @@ static std::string describeSanitizeArg(const llvm::opt::Arg *A,
 /// Sanitizers set.
 static std::string toString(const clang::SanitizerSet &Sanitizers);
 
+/// Return true if an execute-only target disallows data access to code
+/// sections.
+static bool isExecuteOnlyTarget(const llvm::Triple &Triple,
+                                const llvm::opt::ArgList &Args) {
+  if (Triple.isPS5())
+    return true;
+  return Args.hasFlagNoClaim(options::OPT_mexecute_only,
+                             options::OPT_mno_execute_only, false);
+}
+
 static void validateSpecialCaseListFormat(const Driver &D,
                                           std::vector<std::string> &SCLFiles,
                                           unsigned MalformedSCLErrorDiagID,
diff --git a/llvm/include/llvm/Option/ArgList.h b/llvm/include/llvm/Option/ArgList.h
index 310c8900af9ef..c57bd2350af14 100644
--- a/llvm/include/llvm/Option/ArgList.h
+++ b/llvm/include/llvm/Option/ArgList.h
@@ -299,6 +299,7 @@ class ArgList {
   /// \p Default if neither option is given. If both the option and its
   /// negation are present, the last one wins.
   bool hasFlag(OptSpecifier Pos, OptSpecifier Neg, bool Default) const;
+  bool hasFlagNoClaim(OptSpecifier Pos, OptSpecifier Neg, bool Default) const;
 
   /// hasFlag - Given an option \p Pos, an alias \p PosAlias and its negative
   /// form \p Neg, return true if the option or its alias is present, false if
diff --git a/llvm/lib/Option/ArgList.cpp b/llvm/lib/Option/ArgList.cpp
index 400bedabc0037..86f28e578e5d9 100644
--- a/llvm/lib/Option/ArgList.cpp
+++ b/llvm/lib/Option/ArgList.cpp
@@ -75,6 +75,13 @@ bool ArgList::hasFlag(OptSpecifier Pos, OptSpecifier Neg, bool Default) const {
   return Default;
 }
 
+bool ArgList::hasFlagNoClaim(OptSpecifier Pos, OptSpecifier Neg,
+                             bool Default) const {
+  if (Arg *A = getLastArgNoClaim(Pos, Neg))
+    return A->getOption().matches(Pos);
+  return Default;
+}
+
 bool ArgList::hasFlag(OptSpecifier Pos, OptSpecifier PosAlias, OptSpecifier Neg,
                       bool Default) const {
   if (Arg *A = getLastArg(Pos, PosAlias, Neg))

From a612cb0b81d8b2573c30e5ae89a8e899999b045b Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Thu, 31 Aug 2023 09:31:25 -0700
Subject: [PATCH 79/92] [sanitizer][test] -std=c2x instead of -std=c23

Adjust tests from dd230efe703f34678ce52280e50238abf908aaa1 to use
-std=c2x instead, as Clang in release/17.x doesn't support -std=c23.
---
 compiler-rt/test/sanitizer_common/TestCases/scanf.c  | 2 +-
 compiler-rt/test/sanitizer_common/TestCases/strtol.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/compiler-rt/test/sanitizer_common/TestCases/scanf.c b/compiler-rt/test/sanitizer_common/TestCases/scanf.c
index a7f35c2af57ee..a42d9f72a71d9 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/scanf.c
+++ b/compiler-rt/test/sanitizer_common/TestCases/scanf.c
@@ -1,6 +1,6 @@
 // RUN: %clang -std=c17 %s -o %t && %run %t
 /// Test __isoc23_* for glibc 2.38+.
-// RUN: %clang -std=c23 %s -o %t && %run %t
+// RUN: %clang -std=c2x %s -o %t && %run %t
 
 #include <assert.h>
 #include <stdarg.h>
diff --git a/compiler-rt/test/sanitizer_common/TestCases/strtol.c b/compiler-rt/test/sanitizer_common/TestCases/strtol.c
index 9947cdeacd8c3..c3de9bcb7aa04 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/strtol.c
+++ b/compiler-rt/test/sanitizer_common/TestCases/strtol.c
@@ -1,6 +1,6 @@
 // RUN: %clang -std=c17 %s -o %t && %run %t
 /// Test __isoc23_* for glibc 2.38+.
-// RUN: %clang -std=c23 %s -o %t && %run %t
+// RUN: %clang -std=c2x %s -o %t && %run %t
 
 #include <assert.h>
 #include <inttypes.h>

From 6a562bbd51c9ad1ff522d19a0b247595c1f85184 Mon Sep 17 00:00:00 2001
From: Haojian Wu <hokein.wu@gmail.com>
Date: Fri, 28 Jul 2023 17:58:15 +0200
Subject: [PATCH 80/92] [Tooling/Inclusion] Add std::range symbols in the
 mapping.

Fixes https://github.com/llvm/llvm-project/issues/64191

Differential Revision: https://reviews.llvm.org/D156648

(cherry picked from commit 171868dc2cd60c6e3eaeb3861b18ba0e22461291)
---
 .../Inclusions/Stdlib/StdSymbolMap.inc        | 54 +++++++++++++++++++
 clang/tools/include-mapping/gen_std.py        |  5 ++
 2 files changed, 59 insertions(+)

diff --git a/clang/lib/Tooling/Inclusions/Stdlib/StdSymbolMap.inc b/clang/lib/Tooling/Inclusions/Stdlib/StdSymbolMap.inc
index a08ec11e77a4a..b46bd2e4d7a4b 100644
--- a/clang/lib/Tooling/Inclusions/Stdlib/StdSymbolMap.inc
+++ b/clang/lib/Tooling/Inclusions/Stdlib/StdSymbolMap.inc
@@ -3773,6 +3773,33 @@ SYMBOL(viewable_range, std::ranges::, <ranges>)
 SYMBOL(wistream_view, std::ranges::, <ranges>)
 SYMBOL(zip_transform_view, std::ranges::, <ranges>)
 SYMBOL(zip_view, std::ranges::, <ranges>)
+SYMBOL(all, std::ranges::views::, <ranges>)
+SYMBOL(all_t, std::ranges::views::, <ranges>)
+SYMBOL(as_const, std::ranges::views::, <ranges>)
+SYMBOL(as_rvalue, std::ranges::views::, <ranges>)
+SYMBOL(common, std::ranges::views::, <ranges>)
+SYMBOL(counted, std::ranges::views::, <ranges>)
+SYMBOL(drop, std::ranges::views::, <ranges>)
+SYMBOL(drop_while, std::ranges::views::, <ranges>)
+SYMBOL(elements, std::ranges::views::, <ranges>)
+SYMBOL(empty, std::ranges::views::, <ranges>)
+SYMBOL(filter, std::ranges::views::, <ranges>)
+SYMBOL(iota, std::ranges::views::, <ranges>)
+SYMBOL(istream, std::ranges::views::, <ranges>)
+SYMBOL(istream, std::ranges::views::, <iosfwd>)
+SYMBOL(join, std::ranges::views::, <ranges>)
+SYMBOL(join_with, std::ranges::views::, <ranges>)
+SYMBOL(keys, std::ranges::views::, <ranges>)
+SYMBOL(lazy_split, std::ranges::views::, <ranges>)
+SYMBOL(reverse, std::ranges::views::, <ranges>)
+SYMBOL(single, std::ranges::views::, <ranges>)
+SYMBOL(split, std::ranges::views::, <ranges>)
+SYMBOL(take, std::ranges::views::, <ranges>)
+SYMBOL(take_while, std::ranges::views::, <ranges>)
+SYMBOL(transform, std::ranges::views::, <ranges>)
+SYMBOL(values, std::ranges::views::, <ranges>)
+SYMBOL(zip, std::ranges::views::, <ranges>)
+SYMBOL(zip_transform, std::ranges::views::, <ranges>)
 SYMBOL(ECMAScript, std::regex_constants::, <regex>)
 SYMBOL(awk, std::regex_constants::, <regex>)
 SYMBOL(basic, std::regex_constants::, <regex>)
@@ -3817,3 +3844,30 @@ SYMBOL(get_id, std::this_thread::, <thread>)
 SYMBOL(sleep_for, std::this_thread::, <thread>)
 SYMBOL(sleep_until, std::this_thread::, <thread>)
 SYMBOL(yield, std::this_thread::, <thread>)
+SYMBOL(all, std::views::, <ranges>)
+SYMBOL(all_t, std::views::, <ranges>)
+SYMBOL(as_const, std::views::, <ranges>)
+SYMBOL(as_rvalue, std::views::, <ranges>)
+SYMBOL(common, std::views::, <ranges>)
+SYMBOL(counted, std::views::, <ranges>)
+SYMBOL(drop, std::views::, <ranges>)
+SYMBOL(drop_while, std::views::, <ranges>)
+SYMBOL(elements, std::views::, <ranges>)
+SYMBOL(empty, std::views::, <ranges>)
+SYMBOL(filter, std::views::, <ranges>)
+SYMBOL(iota, std::views::, <ranges>)
+SYMBOL(istream, std::views::, <ranges>)
+SYMBOL(istream, std::views::, <iosfwd>)
+SYMBOL(join, std::views::, <ranges>)
+SYMBOL(join_with, std::views::, <ranges>)
+SYMBOL(keys, std::views::, <ranges>)
+SYMBOL(lazy_split, std::views::, <ranges>)
+SYMBOL(reverse, std::views::, <ranges>)
+SYMBOL(single, std::views::, <ranges>)
+SYMBOL(split, std::views::, <ranges>)
+SYMBOL(take, std::views::, <ranges>)
+SYMBOL(take_while, std::views::, <ranges>)
+SYMBOL(transform, std::views::, <ranges>)
+SYMBOL(values, std::views::, <ranges>)
+SYMBOL(zip, std::views::, <ranges>)
+SYMBOL(zip_transform, std::views::, <ranges>)
diff --git a/clang/tools/include-mapping/gen_std.py b/clang/tools/include-mapping/gen_std.py
index 2390ff1f2cced..57a5a6772ba89 100755
--- a/clang/tools/include-mapping/gen_std.py
+++ b/clang/tools/include-mapping/gen_std.py
@@ -242,6 +242,11 @@ def main():
             (symbol_index_root, "filesystem.html", "std::filesystem::"),
             (symbol_index_root, "pmr.html", "std::pmr::"),
             (symbol_index_root, "ranges.html", "std::ranges::"),
+
+            (symbol_index_root, "views.html", "std::ranges::views::"),
+            # std::ranges::views can be accessed as std::views.
+            (symbol_index_root, "views.html", "std::views::"),
+
             (symbol_index_root, "regex_constants.html", "std::regex_constants::"),
             (symbol_index_root, "this_thread.html", "std::this_thread::"),
             # Zombie symbols that were available from the Standard Library, but are

From 466677b126855c79a05a1ebc111eea48053ad4ef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Wed, 30 Aug 2023 23:29:15 +0300
Subject: [PATCH 81/92] [llvm-windres] Implement the windres flag
 --use-temp-file

Whether a temp file or a pipe is used for preprocessing is an
internal detail, but this flag has a notable effect on the preprocessing
in GNU windres. Without this flag, GNU windres passes command
arguments as-is to popen(), which means they get evaluated by a
shell without being re-escaped for this case. To mimic this,
llvm-windres has manually tried to unescape arguments.

When GNU windres is given the --use-temp-file flag, it uses a
different API for invoking the preprocessor, and this API takes care
of preserving special characters in the command line arguments.
For users of GNU windres, this means that by using --use-temp-file,
they don't need to do the (quite terrible) double escaping of
quotes/spaces etc.

The xz project uses the --use-temp-file flag when invoking
GNU windres, see
https://github.com/tukaani-project/xz/commit/6b117d3b1fe91eb26d533ab16a2e552f84148d47.
However as llvm-windres didn't implement this flag and just
assumed the GNU windres popen() behaviour, they had to use a
different codepath for llvm-windres.

That separate codepath for llvm-windres broke later when llvm-windres
got slightly more accurate unescaping of lone quotes in
0f4c6b120f21d582ab7c5c4f2b2a475086c34938 /
https://reviews.llvm.org/D146848 (fixing a discrepancy to GNU
windres as found in https://github.com/llvm/llvm-project/issues/57334),
and this was reported in
https://github.com/mstorsjo/llvm-mingw/issues/363.

Not touching the implementation of the --preprocessor option
with respect to the --use-temp-file flag; that option is doubly
tricky as GNU windres changed its behaviour in a backwards incompatible
way recently (and llvm-windres currently matches the old behaviour).
(See
https://sourceware.org/git/?p=binutils-gdb.git;a=commitdiff;h=21c33bcbe36377abf01614fb1b9be439a3b6de20,
https://sourceware.org/bugzilla/show_bug.cgi?id=27594 and
https://sourceware.org/git/?p=binutils-gdb.git;a=commitdiff;h=5edb8e3f5ad8d74a83fc0df7f6e4514eed0aa77f;hp=3abbafc2aacc6706fea3e3e326e2f08d107c3672
for the behaviour change.)

Differential Revision: https://reviews.llvm.org/D159223

(cherry picked from commit 2bcc0fdc58a220cb9921b47ec8a32c85f2511a47)
---
 llvm/test/tools/llvm-rc/windres-preproc.test |  1 +
 llvm/tools/llvm-rc/WindresOpts.td            |  7 ++++---
 llvm/tools/llvm-rc/llvm-rc.cpp               | 17 +++++++++++++----
 3 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/llvm/test/tools/llvm-rc/windres-preproc.test b/llvm/test/tools/llvm-rc/windres-preproc.test
index 888be03f7d9e4..e55195b3a4d28 100644
--- a/llvm/test/tools/llvm-rc/windres-preproc.test
+++ b/llvm/test/tools/llvm-rc/windres-preproc.test
@@ -4,6 +4,7 @@
 ; REQUIRES: shell
 
 ; RUN: llvm-windres -### --include-dir %p/incdir1 --include %p/incdir2 "-DFOO1=\\\"foo bar\\\"" -UFOO2 -D FOO3 --preprocessor-arg "-DFOO4=\\\"baz baz\\\"" -DFOO5=\"bar\" %p/Inputs/empty.rc %t.res | FileCheck %s --check-prefix=CHECK1
+; RUN: llvm-windres -### --include-dir %p/incdir1 --include %p/incdir2 "-DFOO1=\"foo bar\"" -UFOO2 -D FOO3 --preprocessor-arg "-DFOO4=\"baz baz\"" "-DFOO5=bar" %p/Inputs/empty.rc %t.res --use-temp-file | FileCheck %s --check-prefix=CHECK1
 ; CHECK1: {{^}} "clang" "--driver-mode=gcc" "-target" "{{.*}}-{{.*}}{{mingw32|windows-gnu}}" "-E" "-xc" "-DRC_INVOKED" "{{.*}}empty.rc" "-o" "{{.*}}preproc-{{.*}}.rc" "-I" "{{.*}}incdir1" "-I" "{{.*}}incdir2" "-D" "FOO1=\"foo bar\"" "-U" "FOO2" "-D" "FOO3" "-DFOO4=\"baz baz\"" "-D" "FOO5=bar"{{$}}
 ; RUN: llvm-windres -### --preprocessor "i686-w64-mingw32-gcc -E -DFOO=\\\"foo\\ bar\\\"" %p/Inputs/empty.rc %t.res | FileCheck %s --check-prefix=CHECK2
 ; CHECK2: {{^}} "i686-w64-mingw32-gcc" "-E" "-DFOO=\"foo bar\"" "{{.*}}empty.rc" "-o" "{{.*}}preproc-{{.*}}.rc"{{$}}
diff --git a/llvm/tools/llvm-rc/WindresOpts.td b/llvm/tools/llvm-rc/WindresOpts.td
index 3c75c85ece0f6..42a56dbfda4cd 100644
--- a/llvm/tools/llvm-rc/WindresOpts.td
+++ b/llvm/tools/llvm-rc/WindresOpts.td
@@ -48,6 +48,10 @@ defm codepage : LongShort<"c", "codepage", "Default codepage to use">;
 
 defm language : LongShort<"l", "language", "Default language to use (0x0-0xffff)">;
 
+def use_temp_file: Flag<["--"], "use-temp-file">,
+                   HelpText<"Mimic GNU windres preprocessor option handling "
+                            "(don't unescape preprocessor options)">;
+
 defm verbose : F<"v", "verbose", "Enable verbose output">;
 defm version : F<"V", "version", "Display version">;
 
@@ -57,6 +61,3 @@ defm help : F<"h", "help", "Display this message and exit">;
 def _HASH_HASH_HASH : Flag<["-"], "###">;
 
 def no_preprocess : Flag<["--"], "no-preprocess">;
-
-// Unimplemented options for compatibility
-def use_temp_file: Flag<["--"], "use-temp-file">;
diff --git a/llvm/tools/llvm-rc/llvm-rc.cpp b/llvm/tools/llvm-rc/llvm-rc.cpp
index 233b888546a81..0caa8117cb70b 100644
--- a/llvm/tools/llvm-rc/llvm-rc.cpp
+++ b/llvm/tools/llvm-rc/llvm-rc.cpp
@@ -473,7 +473,14 @@ RcOptions parseWindresOptions(ArrayRef<const char *> ArgsArr,
     // done this double escaping) probably is confined to cases like these
     // quoted string defines, and those happen to work the same across unix
     // and windows.
-    std::string Unescaped = unescape(Arg->getValue());
+    //
+    // If GNU windres is executed with --use-temp-file, it doesn't use
+    // popen() to invoke the preprocessor, but uses another function which
+    // actually preserves tricky characters better. To mimic this behaviour,
+    // don't unescape arguments here.
+    std::string Value = Arg->getValue();
+    if (!InputArgs.hasArg(WINDRES_use_temp_file))
+      Value = unescape(Value);
     switch (Arg->getOption().getID()) {
     case WINDRES_include_dir:
       // Technically, these are handled the same way as e.g. defines, but
@@ -487,17 +494,19 @@ RcOptions parseWindresOptions(ArrayRef<const char *> ArgsArr,
       break;
     case WINDRES_define:
       Opts.PreprocessArgs.push_back("-D");
-      Opts.PreprocessArgs.push_back(Unescaped);
+      Opts.PreprocessArgs.push_back(Value);
       break;
     case WINDRES_undef:
       Opts.PreprocessArgs.push_back("-U");
-      Opts.PreprocessArgs.push_back(Unescaped);
+      Opts.PreprocessArgs.push_back(Value);
       break;
     case WINDRES_preprocessor_arg:
-      Opts.PreprocessArgs.push_back(Unescaped);
+      Opts.PreprocessArgs.push_back(Value);
       break;
     }
   }
+  // TODO: If --use-temp-file is set, we shouldn't be unescaping
+  // the --preprocessor argument either, only splitting it.
   if (InputArgs.hasArg(WINDRES_preprocessor))
     Opts.PreprocessCmd =
         unescapeSplit(InputArgs.getLastArgValue(WINDRES_preprocessor));

From ca372df48a05534d42130e6ecb4f6b275e003a08 Mon Sep 17 00:00:00 2001
From: hstk30 <htsk30@gmail.com>
Date: Thu, 31 Aug 2023 17:54:57 +0100
Subject: [PATCH 82/92] [AArch64] Fix arm neon vstx lane memVT size

The memory size for StN lane stores was set too large, which led alias analysis to go wrong.

Fixes https://github.com/llvm/llvm-project/issues/64696

Differential Revision: https://reviews.llvm.org/D158611

(cherry picked from commit db8f6c009e5a17d304be7404e50eb20b2dd0c75b)
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  50 +++++++--
 .../CodeGen/AArch64/arm64-neon-st-lane-aa.ll  |  34 ++++++
 .../CodeGen/AArch64/multi-vector-load-size.ll | 106 ++++++++++++++++++
 .../AArch64/multi-vector-store-size.ll        |  12 +-
 4 files changed, 185 insertions(+), 17 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/arm64-neon-st-lane-aa.ll
 create mode 100644 llvm/test/CodeGen/AArch64/multi-vector-load-size.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 0605dfa637939..c7a6dd7deb45b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13840,7 +13840,17 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   case Intrinsic::aarch64_neon_ld4:
   case Intrinsic::aarch64_neon_ld1x2:
   case Intrinsic::aarch64_neon_ld1x3:
-  case Intrinsic::aarch64_neon_ld1x4:
+  case Intrinsic::aarch64_neon_ld1x4: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+    Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
+    Info.offset = 0;
+    Info.align.reset();
+    // volatile loads with NEON intrinsics not supported
+    Info.flags = MachineMemOperand::MOLoad;
+    return true;
+  }
   case Intrinsic::aarch64_neon_ld2lane:
   case Intrinsic::aarch64_neon_ld3lane:
   case Intrinsic::aarch64_neon_ld4lane:
@@ -13848,9 +13858,13 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   case Intrinsic::aarch64_neon_ld3r:
   case Intrinsic::aarch64_neon_ld4r: {
     Info.opc = ISD::INTRINSIC_W_CHAIN;
-    // Conservatively set memVT to the entire set of vectors loaded.
-    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
-    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+    // These intrinsics return a struct whose members share one vector type.
+    Type *RetTy = I.getType();
+    auto *StructTy = cast<StructType>(RetTy);
+    unsigned NumElts = StructTy->getNumElements();
+    Type *VecTy = StructTy->getElementType(0);
+    MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
     Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
     Info.offset = 0;
     Info.align.reset();
@@ -13863,20 +13877,40 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   case Intrinsic::aarch64_neon_st4:
   case Intrinsic::aarch64_neon_st1x2:
   case Intrinsic::aarch64_neon_st1x3:
-  case Intrinsic::aarch64_neon_st1x4:
+  case Intrinsic::aarch64_neon_st1x4: {
+    Info.opc = ISD::INTRINSIC_VOID;
+    unsigned NumElts = 0;
+    for (const Value *Arg : I.args()) {
+      Type *ArgTy = Arg->getType();
+      if (!ArgTy->isVectorTy())
+        break;
+      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
+    }
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+    Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
+    Info.offset = 0;
+    Info.align.reset();
+    // volatile stores with NEON intrinsics not supported
+    Info.flags = MachineMemOperand::MOStore;
+    return true;
+  }
   case Intrinsic::aarch64_neon_st2lane:
   case Intrinsic::aarch64_neon_st3lane:
   case Intrinsic::aarch64_neon_st4lane: {
     Info.opc = ISD::INTRINSIC_VOID;
-    // Conservatively set memVT to the entire set of vectors stored.
     unsigned NumElts = 0;
+    // All of the vector operands have the same type.
+    Type *VecTy = I.getArgOperand(0)->getType();
+    MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
+
     for (const Value *Arg : I.args()) {
       Type *ArgTy = Arg->getType();
       if (!ArgTy->isVectorTy())
         break;
-      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
+      NumElts += 1;
     }
-    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
     Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
     Info.offset = 0;
     Info.align.reset();
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-st-lane-aa.ll b/llvm/test/CodeGen/AArch64/arm64-neon-st-lane-aa.ll
new file mode 100644
index 0000000000000..7642597c91f2b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-st-lane-aa.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -mtriple=arm64-none-linux-gnu -mattr=+neon -O2 | FileCheck %s
+
+; The st2 must come before the two ldrb instructions.
+; With the conservative memVT previously set for st2lane, one ldrb was placed before the st2,
+; because basic-aa was led to the wrong conclusion.
+
+define dso_local i32 @test_vst2_lane_u8([2 x <8 x i8>] %vectors.coerce) local_unnamed_addr {
+; CHECK-LABEL:   test_vst2_lane_u8:
+; CHECK:         st2 { v[[V1:[0-9]+]].b, v[[V2:[0-9]+]].b }[6], [x8]
+; CHECK-NEXT:    umov w[[W1:[0-9]+]], v[[V12:[0-9]+]].b[6]
+; CHECK-NEXT:    ldrb w[[W2:[0-9]+]], [sp, #12]
+; CHECK-NEXT:    ldrb w[[W2:[0-9]+]], [sp, #13]
+entry:
+  %temp = alloca [2 x i8], align 4
+  %vectors.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %vectors.coerce, 0
+  %vectors.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %vectors.coerce, 1
+  call void @llvm.lifetime.start.p0(i64 2, ptr nonnull %temp) #4
+  call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> %vectors.coerce.fca.0.extract, <8 x i8> %vectors.coerce.fca.1.extract, i64 6, ptr nonnull %temp)
+  %0 = load i8, ptr %temp, align 4
+  %vget_lane = extractelement <8 x i8> %vectors.coerce.fca.0.extract, i64 6
+  %cmp8.not = icmp ne i8 %0, %vget_lane
+  %arrayidx3.1 = getelementptr inbounds [2 x i8], ptr %temp, i64 0, i64 1
+  %1 = load i8, ptr %arrayidx3.1, align 1
+  %vget_lane.1 = extractelement <8 x i8> %vectors.coerce.fca.1.extract, i64 6
+  %cmp8.not.1 = icmp ne i8 %1, %vget_lane.1
+  %or.cond = select i1 %cmp8.not, i1 true, i1 %cmp8.not.1
+  %cmp.lcssa = zext i1 %or.cond to i32
+  call void @llvm.lifetime.end.p0(i64 2, ptr nonnull %temp) #4
+  ret i32 %cmp.lcssa
+}
+
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #2
+declare void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8>, <8 x i8>, i64, ptr nocapture) #2
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #2
diff --git a/llvm/test/CodeGen/AArch64/multi-vector-load-size.ll b/llvm/test/CodeGen/AArch64/multi-vector-load-size.ll
new file mode 100644
index 0000000000000..ecb953366a88e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/multi-vector-load-size.ll
@@ -0,0 +1,106 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -stop-after=instruction-select < %s | FileCheck %s
+
+%struct.__neon_float32x2x2_t = type { <2 x float>,  <2 x float> }
+%struct.__neon_float32x2x3_t = type { <2 x float>,  <2 x float>,  <2 x float> }
+%struct.__neon_float32x2x4_t = type { <2 x float>,  <2 x float>, <2 x float>,  <2 x float> }
+
+declare %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2.v2f32.p0f32(float*)
+declare %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3.v2f32.p0f32(float*)
+declare %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4.v2f32.p0f32(float*)
+
+declare %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float*)
+declare %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float*)
+declare %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float*)
+
+declare %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2r.v2f32.p0f32(float*)
+declare %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3r.v2f32.p0f32(float*)
+declare %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4r.v2f32.p0f32(float*)
+
+declare %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2lane.v2f32.p0f32(<2 x float>, <2 x float>, i64, float*)
+declare %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, i64, float*)
+declare %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, i64, float*)
+
+
+define %struct.__neon_float32x2x2_t @test_ld2(float* %addr) {
+  ; CHECK-LABEL: name: test_ld2
+  ; CHECK: LD2Twov2s {{.*}} :: (load (s128) {{.*}})
+  %val = call %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2.v2f32.p0f32(float* %addr)
+  ret %struct.__neon_float32x2x2_t %val
+}
+
+define %struct.__neon_float32x2x3_t @test_ld3(float* %addr) {
+  ; CHECK-LABEL: name: test_ld3
+  ; CHECK: LD3Threev2s {{.*}} :: (load (s192) {{.*}})
+  %val = call %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3.v2f32.p0f32(float* %addr)
+  ret %struct.__neon_float32x2x3_t %val
+}
+
+define %struct.__neon_float32x2x4_t @test_ld4(float* %addr) {
+  ; CHECK-LABEL: name: test_ld4
+  ; CHECK: LD4Fourv2s {{.*}} :: (load (s256) {{.*}})
+  %val = call %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4.v2f32.p0f32(float* %addr)
+  ret %struct.__neon_float32x2x4_t %val
+}
+
+define %struct.__neon_float32x2x2_t @test_ld1x2(float* %addr) {
+  ; CHECK-LABEL: name: test_ld1x2
+  ; CHECK: LD1Twov2s {{.*}} :: (load (s128) {{.*}})
+  %val = call %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float* %addr)
+  ret %struct.__neon_float32x2x2_t %val
+}
+
+define %struct.__neon_float32x2x3_t @test_ld1x3(float* %addr) {
+  ; CHECK-LABEL: name: test_ld1x3
+  ; CHECK: LD1Threev2s {{.*}} :: (load (s192) {{.*}})
+  %val = call %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float* %addr)
+  ret %struct.__neon_float32x2x3_t %val
+}
+
+define %struct.__neon_float32x2x4_t @test_ld1x4(float* %addr) {
+  ; CHECK-LABEL: name: test_ld1x4
+  ; CHECK: LD1Fourv2s {{.*}} :: (load (s256) {{.*}})
+  %val = call %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float* %addr)
+  ret %struct.__neon_float32x2x4_t %val
+}
+
+define %struct.__neon_float32x2x2_t @test_ld2r(float* %addr) {
+  ; CHECK-LABEL: name: test_ld2r
+  ; CHECK: LD2Rv2s {{.*}} :: (load (s64) {{.*}})
+  %val = call %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2r.v2f32.p0f32(float* %addr)
+  ret %struct.__neon_float32x2x2_t %val
+}
+
+define %struct.__neon_float32x2x3_t @test_ld3r(float* %addr) {
+  ; CHECK-LABEL: name: test_ld3r
+  ; CHECK: LD3Rv2s {{.*}} :: (load (s96) {{.*}})
+  %val = call %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3r.v2f32.p0f32(float* %addr)
+  ret %struct.__neon_float32x2x3_t %val
+}
+
+define %struct.__neon_float32x2x4_t @test_ld4r(float* %addr) {
+  ; CHECK-LABEL: name: test_ld4r
+  ; CHECK: LD4Rv2s {{.*}} :: (load (s128) {{.*}})
+  %val = call %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4r.v2f32.p0f32(float* %addr)
+  ret %struct.__neon_float32x2x4_t %val
+}
+
+define %struct.__neon_float32x2x2_t @test_ld2lane(<2 x float> %a, <2 x float> %b, float* %addr) {
+  ; CHECK-LABEL: name: test_ld2lane
+  ; CHECK: {{.*}} LD2i32 {{.*}}
+  %val = call %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2lane.v2f32.p0f32(<2 x float> %a, <2 x float> %b, i64 1, float* %addr)
+  ret %struct.__neon_float32x2x2_t %val
+}
+
+define %struct.__neon_float32x2x3_t @test_ld3lane(<2 x float> %a, <2 x float> %b, <2 x float> %c, float* %addr) {
+  ; CHECK-LABEL: name: test_ld3lane
+  ; CHECK: {{.*}} LD3i32 {{.*}}
+  %val = call %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3lane.v2f32.p0f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, i64 1, float* %addr)
+  ret %struct.__neon_float32x2x3_t %val
+}
+
+define %struct.__neon_float32x2x4_t @test_ld4lane(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d, float* %addr) {
+  ; CHECK-LABEL: name: test_ld4lane
+  ; CHECK: {{.*}} LD4i32 {{.*}}
+  %val = call %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4lane.v2f32.p0f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d, i64 1, float* %addr)
+  ret %struct.__neon_float32x2x4_t %val
+}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AArch64/multi-vector-store-size.ll b/llvm/test/CodeGen/AArch64/multi-vector-store-size.ll
index 5763ec61667f2..3710db9c47ff6 100644
--- a/llvm/test/CodeGen/AArch64/multi-vector-store-size.ll
+++ b/llvm/test/CodeGen/AArch64/multi-vector-store-size.ll
@@ -23,8 +23,6 @@ define void @addstx(ptr %res, ptr %a,  ptr %b, ptr %c, ptr %d) {
   %cr = fadd <4 x float> %cl, %dl
   %dr = fadd <4 x float> %dl, %al
 
-; The sizes below are conservative.  AArch64TargetLowering
-; conservatively assumes the entire vector is stored.
   tail call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float> %ar, <4 x float> %br, ptr %res)
 ; CHECK: ST2Twov4s {{.*}} :: (store (s256) {{.*}})
   tail call void @llvm.aarch64.neon.st3.v4f32.p0(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, ptr %res)
@@ -46,8 +44,6 @@ define void @addst1x(ptr %res, ptr %a,  ptr %b, ptr %c, ptr %d) {
   %cr = fadd <4 x float> %cl, %dl
   %dr = fadd <4 x float> %dl, %al
 
-; The sizes below are conservative.  AArch64TargetLowering
-; conservatively assumes the entire vector is stored.
   tail call void @llvm.aarch64.neon.st1x2.v4f32.p0(<4 x float> %ar, <4 x float> %br, ptr %res)
 ; CHECK: ST1Twov4s {{.*}} :: (store (s256) {{.*}})
   tail call void @llvm.aarch64.neon.st1x3.v4f32.p0(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, ptr %res)
@@ -69,14 +65,12 @@ define void @addstxlane(ptr %res, ptr %a,  ptr %b, ptr %c, ptr %d) {
   %cr = fadd <4 x float> %cl, %dl
   %dr = fadd <4 x float> %dl, %al
 
-; The sizes below are conservative.  AArch64TargetLowering
-; conservatively assumes the entire vector is stored.
   tail call void @llvm.aarch64.neon.st2lane.v4f32.p0(<4 x float> %ar, <4 x float> %br, i64 1, ptr %res)
-; CHECK: ST2i32 {{.*}} :: (store (s256) {{.*}})
+; CHECK: ST2i32 {{.*}} :: (store (s64) {{.*}})
   tail call void @llvm.aarch64.neon.st3lane.v4f32.p0(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, i64 1, ptr %res)
-; CHECK: ST3i32 {{.*}} :: (store (s384) {{.*}})
+; CHECK: ST3i32 {{.*}} :: (store (s96) {{.*}})
   tail call void @llvm.aarch64.neon.st4lane.v4f32.p0(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, <4 x float> %dr, i64 1, ptr %res)
-; CHECK: ST4i32 {{.*}} :: (store (s512) {{.*}})
+; CHECK: ST4i32 {{.*}} :: (store (s128) {{.*}})
 
   ret void
 }

From e04acab63c2682c690790111d688419158f15978 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Thu, 31 Aug 2023 23:26:46 -0700
Subject: [PATCH 83/92] [Driver] Report warnings for unclaimed TargetSpecific
 options for assembler input

This patch amends D151590 to not error for unclaimed TargetSpecific
options for `-x assembler` input files. This input type causes Driver to
construct tools::ClangAs (-fintegrated-as) or other assemblers (e.g.
tools::gnutools::Assembler). Their ConstructJobs methods, unlike
Clang::ConstructJobs, claim very few options. If an option is unclaimed,
it either leads to a -Wunused-command-line-argument warning or an error
(if `TargetSpecific` is set):
```
% clang '-###' --target=aarch64 -mbranch-protection=bti -c a.s
clang: error: unsupported option '-mbranch-protection=' for target 'aarch64'
```

It seems that downgrading the diagnostic to warning is most useful as
many users use CFLAGS even for `.s` files:
```
clang --target=aarch64 -mbranch-protection=bti -S a.c
clang --target=aarch64 -mbranch-protection=bti -c a.s
```

I decided not to suppress the warning so that
-Wunused-command-line-argument lovers still get a warning, which helps
projects use proper ASFLAGS/CFLAGS/etc.

Note: `-mbranch-protection=bti a.S` currently has no warning as `-x assembler-with-cpp`
instructs clangDriver to select tools::Clang and claim most options.

Revert D159010 to demonstrate that we emit a warning for -mfpmath= for
`-x assembler` input.

Modify my AIX cleanup cd18efb61d759405956dbd30e4b5f2720d8e1783 to
add an err_drv_unsupported_opt_for_target.

Reviewed By: thesamesam

Differential Revision: https://reviews.llvm.org/D159173

(cherry picked from commit e9d454d1c195958645fb0948f8b97262e7f8b33a)
---
 clang/lib/Driver/Driver.cpp                |  8 +++++++-
 clang/lib/Driver/ToolChains/AIX.cpp        |  6 ++++++
 clang/lib/Driver/ToolChains/Arch/X86.cpp   |  8 +-------
 clang/lib/Driver/ToolChains/Arch/X86.h     |  2 +-
 clang/lib/Driver/ToolChains/CommonArgs.cpp |  2 +-
 clang/test/Driver/target-specific.s        | 12 ++++++++++++
 clang/test/Driver/x86-mfpmath.c            |  4 ++--
 7 files changed, 30 insertions(+), 12 deletions(-)
 create mode 100644 clang/test/Driver/target-specific.s

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index f6ea4d0b43667..bdbdad9362e19 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -4936,6 +4936,12 @@ void Driver::BuildJobs(Compilation &C) const {
   (void)C.getArgs().hasArg(options::OPT_driver_mode);
   (void)C.getArgs().hasArg(options::OPT_rsp_quoting);
 
+  bool HasAssembleJob = llvm::any_of(C.getJobs(), [](auto &J) {
+    // Match ClangAs and other derived assemblers of Tool. ClangAs uses a
+    // longer ShortName "clang integrated assembler" while other assemblers just
+    // use "assembler".
+    return strstr(J.getCreator().getShortName(), "assembler");
+  });
   for (Arg *A : C.getArgs()) {
     // FIXME: It would be nice to be able to send the argument to the
     // DiagnosticsEngine, so that extra values, position, and so on could be
@@ -4965,7 +4971,7 @@ void Driver::BuildJobs(Compilation &C) const {
       // already been warned about.
       if (!IsCLMode() || !A->getOption().matches(options::OPT_UNKNOWN)) {
         if (A->getOption().hasFlag(options::TargetSpecific) &&
-            !A->isIgnoredTargetSpecific()) {
+            !A->isIgnoredTargetSpecific() && !HasAssembleJob) {
           Diag(diag::err_drv_unsupported_opt_for_target)
               << A->getSpelling() << getTargetTriple();
         } else {
diff --git a/clang/lib/Driver/ToolChains/AIX.cpp b/clang/lib/Driver/ToolChains/AIX.cpp
index 97217eba9ca01..bfc86d9f34718 100644
--- a/clang/lib/Driver/ToolChains/AIX.cpp
+++ b/clang/lib/Driver/ToolChains/AIX.cpp
@@ -30,6 +30,7 @@ void aix::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
                                   const InputInfoList &Inputs,
                                   const ArgList &Args,
                                   const char *LinkingOutput) const {
+  const Driver &D = getToolChain().getDriver();
   ArgStringList CmdArgs;
 
   const bool IsArch32Bit = getToolChain().getTriple().isArch32Bit();
@@ -38,6 +39,11 @@ void aix::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
   if (!IsArch32Bit && !IsArch64Bit)
     llvm_unreachable("Unsupported bit width value.");
 
+  if (Arg *A = C.getArgs().getLastArg(options::OPT_G)) {
+    D.Diag(diag::err_drv_unsupported_opt_for_target)
+        << A->getSpelling() << D.getTargetTriple();
+  }
+
   // Specify the mode in which the as(1) command operates.
   if (IsArch32Bit) {
     CmdArgs.push_back("-a32");
diff --git a/clang/lib/Driver/ToolChains/Arch/X86.cpp b/clang/lib/Driver/ToolChains/Arch/X86.cpp
index 4383b80041435..cf2bc63d74ada 100644
--- a/clang/lib/Driver/ToolChains/Arch/X86.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/X86.cpp
@@ -118,13 +118,7 @@ std::string x86::getX86TargetCPU(const Driver &D, const ArgList &Args,
 
 void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
                                const ArgList &Args,
-                               std::vector<StringRef> &Features, bool ForAS) {
-  if (ForAS) {
-    // Some target-specific options are only handled in AddX86TargetArgs, which
-    // is not called by ClangAs::ConstructJob. Claim them here.
-    Args.claimAllArgs(options::OPT_mfpmath_EQ);
-  }
-
+                               std::vector<StringRef> &Features) {
   // Claim and report unsupported -mabi=. Note: we don't support "sysv_abi" or
   // "ms_abi" as default function attributes.
   if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mabi_EQ)) {
diff --git a/clang/lib/Driver/ToolChains/Arch/X86.h b/clang/lib/Driver/ToolChains/Arch/X86.h
index 762a1fa6f4d5f..e07387f3ece3d 100644
--- a/clang/lib/Driver/ToolChains/Arch/X86.h
+++ b/clang/lib/Driver/ToolChains/Arch/X86.h
@@ -26,7 +26,7 @@ std::string getX86TargetCPU(const Driver &D, const llvm::opt::ArgList &Args,
 
 void getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
                           const llvm::opt::ArgList &Args,
-                          std::vector<llvm::StringRef> &Features, bool ForAS);
+                          std::vector<llvm::StringRef> &Features);
 
 } // end namespace x86
 } // end namespace target
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 8766d34eec538..0d6907b8e5c7a 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -528,7 +528,7 @@ void tools::getTargetFeatures(const Driver &D, const llvm::Triple &Triple,
     break;
   case llvm::Triple::x86:
   case llvm::Triple::x86_64:
-    x86::getX86TargetFeatures(D, Triple, Args, Features, ForAS);
+    x86::getX86TargetFeatures(D, Triple, Args, Features);
     break;
   case llvm::Triple::hexagon:
     hexagon::getHexagonTargetFeatures(D, Triple, Args, Features);
diff --git a/clang/test/Driver/target-specific.s b/clang/test/Driver/target-specific.s
new file mode 100644
index 0000000000000..aa4fc73812099
--- /dev/null
+++ b/clang/test/Driver/target-specific.s
@@ -0,0 +1,12 @@
+/// Check that we report a warning instead of an error for target-specific compilation only options.
+// RUN: %clang -### --target=aarch64 -faddrsig -mbranch-protection=standard -c %s 2>&1 | FileCheck %s
+// RUN: %clang -### --target=aarch64 -faddrsig -mbranch-protection=standard -c -fno-integrated-as %s 2>&1 | FileCheck %s
+
+/// Report a warning if we perform the link phase.
+// RUN: %clang -### --target=aarch64 -faddrsig -mbranch-protection=standard %s 2>&1 | FileCheck %s
+
+// CHECK: warning: argument unused during compilation: '-faddrsig'
+// CHECK: warning: argument unused during compilation: '-mbranch-protection=standard'
+
+/// assembler-with-cpp claims compile only options. Ideally we should emit a warning.
+// RUN: %clang -### -Werror --target=aarch64 -c -faddrsig -mbranch-protection=standard -x assembler-with-cpp %s
diff --git a/clang/test/Driver/x86-mfpmath.c b/clang/test/Driver/x86-mfpmath.c
index 7df594477a92c..8f85cced953ab 100644
--- a/clang/test/Driver/x86-mfpmath.c
+++ b/clang/test/Driver/x86-mfpmath.c
@@ -1,5 +1,5 @@
 // RUN: %clang -### -c --target=x86_64 -mfpmath=sse %s 2>&1 | FileCheck %s
 // CHECK: "-mfpmath" "sse"
 
-/// Don't warn for assembler input.
-// RUN: %clang -### -Werror -c --target=x86_64 -mfpmath=sse -x assembler %s 2>&1 | FileCheck /dev/null --implicit-check-not='"-mfpmath"'
+// RUN: %clang -### -c --target=x86_64 -mfpmath=sse -x assembler %s 2>&1 | FileCheck %s --check-prefix=WARN
+// WARN: warning: argument unused during compilation: '-mfpmath=sse'

From ecea6d6828d6591c80285ffb5c2be1312c201549 Mon Sep 17 00:00:00 2001
From: Vassil Vassilev <v.g.vassilev@gmail.com>
Date: Fri, 1 Sep 2023 19:50:54 +0000
Subject: [PATCH 84/92] [CodeGen] First check the kind and then the
 llvm::Function properties.

This patch fixes valgrind reports from downstream consumers about a
conditional jump that depends on uninitialised memory.

The original report:

```[ RUN      ] ScopeReflectionTest.IsComplete
==987150== Conditional jump or move depends on uninitialised value(s)
==987150==    at 0x1E1128F: clang::CodeGen::CodeGenModule::SetLLVMFunctionAttributesForDefinition(clang::Decl const*, llvm::Function*) (CodeGenModule.cpp:2391)
==987150==    by 0x1E4F181: clang::CodeGen::CodeGenModule::EmitGlobalFunctionDefinition(clang::GlobalDecl, llvm::GlobalValue*) (CodeGenModule.cpp:5669)
==987150==    by 0x1E4A194: clang::CodeGen::CodeGenModule::EmitGlobalDefinition(clang::GlobalDecl, llvm::GlobalValue*) (CodeGenModule.cpp:3909)
==987150==    by 0x1E4A752: clang::CodeGen::CodeGenModule::EmitGlobal(clang::GlobalDecl) (CodeGenModule.cpp:3649)
==987150==    by 0x1E532F5: clang::CodeGen::CodeGenModule::EmitTopLevelDecl(clang::Decl*) [clone .part.0] (CodeGenModule.cpp:6563)
==987150==    by 0x1B0BEDD: (anonymous namespace)::CodeGeneratorImpl::HandleTopLevelDecl(clang::DeclGroupRef) (ModuleBuilder.cpp:190)
==987150==    by 0x1AEA47B: clang::BackendConsumer::HandleTopLevelDecl(clang::DeclGroupRef) (CodeGenAction.cpp:235)
==987150==    by 0x101B02F: clang::IncrementalASTConsumer::HandleTopLevelDecl(clang::DeclGroupRef) (IncrementalParser.cpp:52)
==987150==    by 0x101ED93: clang::IncrementalParser::ParseOrWrapTopLevelDecl() (IncrementalParser.cpp:276)
==987150==    by 0x101FBBC: clang::IncrementalParser::Parse(llvm::StringRef) (IncrementalParser.cpp:342)
==987150==    by 0x100E104: clang::Interpreter::Parse(llvm::StringRef) (Interpreter.cpp:360)
==987150==    by 0xE734C0: Cpp::Interpreter::Parse(llvm::StringRef) (CppInterOpInterpreter.h:172)
==987150==  Uninitialised value was created by a heap allocation
==987150==    at 0x844BE63: operator new(unsigned long) (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==987150==    by 0x1B0C882: StartModule (ModuleBuilder.cpp:139)
==987150==    by 0x1B0C882: clang::CodeGenerator::StartModule(llvm::StringRef, llvm::LLVMContext&) (ModuleBuilder.cpp:360)
==987150==    by 0x101C4AF: clang::IncrementalParser::GenModule() (IncrementalParser.cpp:372)
==987150==    by 0x101FC0E: clang::IncrementalParser::Parse(llvm::StringRef) (IncrementalParser.cpp:362)
==987150==    by 0x100E104: clang::Interpreter::Parse(llvm::StringRef) (Interpreter.cpp:360)
==987150==    by 0x100E243: clang::Interpreter::create(std::unique_ptr<clang::CompilerInstance, std::default_delete<clang::CompilerInstance> >) (Interpreter.cpp:279)
==987150==    by 0xF2131A: compat::createClangInterpreter(std::vector<char const*, std::allocator<char const*> >&) (Compatibility.h:123)
==987150==    by 0xF22AB9: Cpp::Interpreter::Interpreter(int, char const* const*, char const*, std::vector<std::shared_ptr<clang::ModuleFileExtension>, std::allocator<std::shared_ptr<clang::ModuleFileExtension> > > const&, void*, bool) (CppInterOpInterpreter.h:146)
==987150==    by 0xF1827A: CreateInterpreter (CppInterOp.cpp:2494)
==987150==    by 0xECFA0E: TestUtils::GetAllTopLevelDecls(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::vector<clang::Decl*, std::allocator<clang::Decl*> >&, bool) (Utils.cpp:23)
==987150==    by 0xE9CB85: ScopeReflectionTest_IsComplete_Test::TestBody() (ScopeReflectionTest.cpp:71)
==987150==    by 0xF0ED0C: void testing::internal::HandleExceptionsInMethodIfSupported<testing::Test, void>(testing::Test*, void (testing::Test::*)(), char const*) (in /home/vvassilev/workspace/builds/scratch/cppyy/InterOp/build-with-clang-repl-release/unittests/CppInterOp/CppInterOpTests)
==987150==
```

Differential revision: https://reviews.llvm.org/D159339

(cherry picked from commit 92246a9be0ba47788ada9621bef58ce7819be526)
---
 clang/lib/CodeGen/CodeGenModule.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index a3506df7d4e5a..f09d1129b128a 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -2386,7 +2386,7 @@ void CodeGenModule::SetLLVMFunctionAttributesForDefinition(const Decl *D,
   // functions. If the current target's C++ ABI requires this and this is a
   // member function, set its alignment accordingly.
   if (getTarget().getCXXABI().areMemberFunctionsAligned()) {
-    if (F->getPointerAlignment(getDataLayout()) < 2 && isa<CXXMethodDecl>(D))
+    if (isa<CXXMethodDecl>(D) && F->getPointerAlignment(getDataLayout()) < 2)
       F->setAlignment(std::max(llvm::Align(2), F->getAlign().valueOrOne()));
   }
 

From 7150c6d4ee4090e6908ed65abd433f561c70877c Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 27 Aug 2023 22:19:31 -0700
Subject: [PATCH 85/92] [sanitizer] scanf interceptor: fix write size for
 %mc/%mC/%mS

When the optional assignment-allocation character 'm' (Extension to the
ISO C standard) is present, we currently use internal_strlen(buf)+1 for
all of cCsS[ (D85350). Fix cCS to use the correct size.

Fix https://github.com/llvm/llvm-project/issues/61768

Reviewed By: #sanitizers, vitalybuka

Differential Revision: https://reviews.llvm.org/D158485

(cherry picked from commit beeb37a8f3275281be305d2d1afe35ca053e21c0)
---
 .../sanitizer_common_interceptors_format.inc  | 16 ++++++---
 .../sanitizer_format_interceptor_test.cpp     | 34 ++++++++++++++-----
 2 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_format.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_format.inc
index 220abb89c3beb..24485900644b3 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_format.inc
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_format.inc
@@ -340,11 +340,19 @@ static void scanf_common(void *ctx, int n_inputs, bool allowGnuMalloc,
       size = 0;
     }
     COMMON_INTERCEPTOR_WRITE_RANGE(ctx, argp, size);
-    // For %ms/%mc, write the allocated output buffer as well.
+    // For %mc/%mC/%ms/%m[/%mS, write the allocated output buffer as well.
     if (dir.allocate) {
-      char *buf = *(char **)argp;
-      if (buf)
-        COMMON_INTERCEPTOR_WRITE_RANGE(ctx, buf, internal_strlen(buf) + 1);
+      if (char *buf = *(char **)argp) {
+        if (dir.convSpecifier == 'c')
+          size = 1;
+        else if (dir.convSpecifier == 'C')
+          size = sizeof(wchar_t);
+        else if (dir.convSpecifier == 'S')
+          size = (internal_wcslen((wchar_t *)buf) + 1) * sizeof(wchar_t);
+        else  // 's' or '['
+          size = internal_strlen(buf) + 1;
+        COMMON_INTERCEPTOR_WRITE_RANGE(ctx, buf, size);
+      }
     }
   }
 }
diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_format_interceptor_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_format_interceptor_test.cpp
index fa52ccc1994f6..de96e573ab844 100644
--- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_format_interceptor_test.cpp
+++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_format_interceptor_test.cpp
@@ -9,14 +9,16 @@
 // Tests for *scanf interceptors implementation in sanitizer_common.
 //
 //===----------------------------------------------------------------------===//
+#include <wchar.h>
+
 #include <algorithm>
 #include <vector>
 
+#include "gtest/gtest.h"
 #include "interception/interception.h"
-#include "sanitizer_test_utils.h"
-#include "sanitizer_common/sanitizer_libc.h"
 #include "sanitizer_common/sanitizer_common.h"
-#include "gtest/gtest.h"
+#include "sanitizer_common/sanitizer_libc.h"
+#include "sanitizer_test_utils.h"
 
 using namespace __sanitizer;
 
@@ -206,21 +208,35 @@ TEST(SanitizerCommonInterceptors, Scanf) {
 
 TEST(SanitizerCommonInterceptors, ScanfAllocate) {
   const char *buf = "123456";
+  const wchar_t *wbuf = L"123";
 
   // Can not use testScanf() because this case needs a valid pointer to a string
   // in the scanf argument.
+  {
+    std::vector<unsigned> scanf_sizes;
+    testScanf3((void *)&scanf_sizes, 2, /*allowGnuMalloc=*/false, "%mc", &buf);
+    verifyFormatResults("%mc", 2, scanf_sizes, {P, 1u});
+  }
+  {
+    std::vector<unsigned> scanf_sizes;
+    testScanf3((void *)&scanf_sizes, 2, /*allowGnuMalloc=*/false, "%mC", &wbuf);
+    verifyFormatResults("%mC", 2, scanf_sizes, {P, (unsigned)sizeof(wchar_t)});
+  }
   {
     std::vector<unsigned> scanf_sizes;
     testScanf3((void *)&scanf_sizes, 2, /*allowGnuMalloc=*/false, "%ms", &buf);
-    verifyFormatResults("%ms", 2, scanf_sizes,
-                        {P, (unsigned)(strlen(buf) + 1)});
+    verifyFormatResults("%ms", 2, scanf_sizes, {P, unsigned(strlen(buf) + 1)});
+    scanf_sizes.clear();
+    testScanf3((void *)&scanf_sizes, 2, /*allowGnuMalloc=*/false, "%m[0-9]",
+               &buf);
+    verifyFormatResults("%m[0-9]", 2, scanf_sizes,
+                        {P, unsigned(strlen(buf) + 1)});
   }
-
   {
     std::vector<unsigned> scanf_sizes;
-    testScanf3((void *)&scanf_sizes, 2, /*allowGnuMalloc=*/false, "%mc", &buf);
-    verifyFormatResults("%mc", 2, scanf_sizes,
-                        {P, (unsigned)(strlen(buf) + 1)});
+    testScanf3((void *)&scanf_sizes, 2, /*allowGnuMalloc=*/false, "%mS", &wbuf);
+    verifyFormatResults("%mS", 2, scanf_sizes,
+                        {P, unsigned((wcslen(wbuf) + 1) * sizeof(wchar_t))});
   }
 }
 

From 175a130b9f84583267ba04c3298cd73b02d5b76a Mon Sep 17 00:00:00 2001
From: Elizabeth Andrews <elizabeth.andrews@intel.com>
Date: Mon, 14 Aug 2023 12:01:14 -0700
Subject: [PATCH 86/92] [NFC][Clang] Fix static analyzer concern about null
 value dereference

Differential Revision: https://reviews.llvm.org/D157554
---
 clang/lib/Sema/SemaExprCXX.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index 423d5372a6f65..08a025a3c8001 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -9072,8 +9072,10 @@ Sema::BuildExprRequirement(
     MultiLevelTemplateArgumentList MLTAL(Param, TAL.asArray(),
                                          /*Final=*/false);
     MLTAL.addOuterRetainedLevels(TPL->getDepth());
-    Expr *IDC = Param->getTypeConstraint()->getImmediatelyDeclaredConstraint();
-    ExprResult Constraint = SubstExpr(IDC, MLTAL);
+    const TypeConstraint *TC = Param->getTypeConstraint();
+    assert(TC && "Type Constraint cannot be null here");
+    ExprResult Constraint =
+        SubstExpr(TC->getImmediatelyDeclaredConstraint(), MLTAL);
     if (Constraint.isInvalid()) {
       Status = concepts::ExprRequirement::SS_ExprSubstitutionFailure;
     } else {

From c2c9c0f1388e435c4b2416d658ea005d5e724202 Mon Sep 17 00:00:00 2001
From: Younan Zhang <zyn7109@gmail.com>
Date: Wed, 16 Aug 2023 15:33:58 +0800
Subject: [PATCH 87/92] [clang] Construct ExprRequirement with
 SubstitutionDiagnostic on SubstFailure

We're expecting a SubstitutionDiagnostic in diagnoseUnsatisfiedRequirement
if the status of ExprRequirement is SubstFailure. Previously, the Requirement
was created with Expr on SubstFailure by mistake, which could lead to the
assertion failure in the subsequent diagnosis.

Fixes https://github.com/clangd/clangd/issues/1726
Fixes https://github.com/llvm/llvm-project/issues/64723
Fixes https://github.com/llvm/llvm-project/issues/64172

In addition, this patch also fixes an invalid test from D129499.

Reviewed By: erichkeane

Differential Revision: https://reviews.llvm.org/D158061
---
 clang/docs/ReleaseNotes.rst                   |  4 ++
 clang/include/clang/AST/ExprConcepts.h        | 14 +++++--
 clang/lib/Sema/SemaExprCXX.cpp                | 25 ++++++++-----
 clang/lib/Sema/SemaTemplateInstantiate.cpp    | 17 ++++++++-
 .../SemaCXX/concept-crash-on-diagnostic.cpp   | 37 +++++++++++++++++++
 clang/test/SemaCXX/concept-fatal-error.cpp    |  4 +-
 6 files changed, 85 insertions(+), 16 deletions(-)
 create mode 100644 clang/test/SemaCXX/concept-crash-on-diagnostic.cpp

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 76cc074dede76..456c724514369 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -852,6 +852,10 @@ Bug Fixes to C++ Support
 - Update ``FunctionDeclBitfields.NumFunctionDeclBits``. This fixes:
   (`#64171 <https://github.com/llvm/llvm-project/issues/64171>`_).
 
+- Fix a crash caused by substitution failure in expression requirements.
+  (`#64172 <https://github.com/llvm/llvm-project/issues/64172>`_) and
+  (`#64723 <https://github.com/llvm/llvm-project/issues/64723>`_).
+
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/clang/include/clang/AST/ExprConcepts.h b/clang/include/clang/AST/ExprConcepts.h
index d900e980852b4..13d4568119eb2 100644
--- a/clang/include/clang/AST/ExprConcepts.h
+++ b/clang/include/clang/AST/ExprConcepts.h
@@ -14,20 +14,21 @@
 #ifndef LLVM_CLANG_AST_EXPRCONCEPTS_H
 #define LLVM_CLANG_AST_EXPRCONCEPTS_H
 
-#include "clang/AST/ASTContext.h"
 #include "clang/AST/ASTConcept.h"
+#include "clang/AST/ASTContext.h"
 #include "clang/AST/Decl.h"
-#include "clang/AST/DeclarationName.h"
 #include "clang/AST/DeclTemplate.h"
+#include "clang/AST/DeclarationName.h"
 #include "clang/AST/Expr.h"
 #include "clang/AST/NestedNameSpecifier.h"
 #include "clang/AST/TemplateBase.h"
 #include "clang/AST/Type.h"
 #include "clang/Basic/SourceLocation.h"
+#include "llvm/ADT/STLFunctionalExtras.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TrailingObjects.h"
-#include <utility>
 #include <string>
+#include <utility>
 
 namespace clang {
 class ASTStmtReader;
@@ -467,6 +468,13 @@ class NestedRequirement : public Requirement {
   }
 };
 
+using EntityPrinter = llvm::function_ref<void(llvm::raw_ostream &)>;
+
+/// \brief create a Requirement::SubstitutionDiagnostic with only a
+/// SubstitutedEntity and DiagLoc using Sema's allocator.
+Requirement::SubstitutionDiagnostic *
+createSubstDiagAt(Sema &S, SourceLocation Location, EntityPrinter Printer);
+
 } // namespace concepts
 
 /// C++2a [expr.prim.req]:
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index 08a025a3c8001..1cff4a75790ec 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -19,6 +19,7 @@
 #include "clang/AST/CharUnits.h"
 #include "clang/AST/DeclObjC.h"
 #include "clang/AST/ExprCXX.h"
+#include "clang/AST/ExprConcepts.h"
 #include "clang/AST/ExprObjC.h"
 #include "clang/AST/RecursiveASTVisitor.h"
 #include "clang/AST/Type.h"
@@ -9074,16 +9075,22 @@ Sema::BuildExprRequirement(
     MLTAL.addOuterRetainedLevels(TPL->getDepth());
     const TypeConstraint *TC = Param->getTypeConstraint();
     assert(TC && "Type Constraint cannot be null here");
-    ExprResult Constraint =
-        SubstExpr(TC->getImmediatelyDeclaredConstraint(), MLTAL);
+    auto *IDC = TC->getImmediatelyDeclaredConstraint();
+    assert(IDC && "ImmediatelyDeclaredConstraint can't be null here.");
+    ExprResult Constraint = SubstExpr(IDC, MLTAL);
     if (Constraint.isInvalid()) {
-      Status = concepts::ExprRequirement::SS_ExprSubstitutionFailure;
-    } else {
-      SubstitutedConstraintExpr =
-          cast<ConceptSpecializationExpr>(Constraint.get());
-      if (!SubstitutedConstraintExpr->isSatisfied())
-        Status = concepts::ExprRequirement::SS_ConstraintsNotSatisfied;
-    }
+      return new (Context) concepts::ExprRequirement(
+          concepts::createSubstDiagAt(*this, IDC->getExprLoc(),
+                                      [&](llvm::raw_ostream &OS) {
+                                        IDC->printPretty(OS, /*Helper=*/nullptr,
+                                                         getPrintingPolicy());
+                                      }),
+          IsSimple, NoexceptLoc, ReturnTypeRequirement);
+    }
+    SubstitutedConstraintExpr =
+        cast<ConceptSpecializationExpr>(Constraint.get());
+    if (!SubstitutedConstraintExpr->isSatisfied())
+      Status = concepts::ExprRequirement::SS_ConstraintsNotSatisfied;
   }
   return new (Context) concepts::ExprRequirement(E, IsSimple, NoexceptLoc,
                                                  ReturnTypeRequirement, Status,
diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp
index 8702e2ca3a1b3..394006a57747d 100644
--- a/clang/lib/Sema/SemaTemplateInstantiate.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp
@@ -2276,9 +2276,9 @@ QualType TemplateInstantiator::TransformSubstTemplateTypeParmPackType(
       getPackIndex(Pack), Arg, TL.getNameLoc());
 }
 
-template<typename EntityPrinter>
 static concepts::Requirement::SubstitutionDiagnostic *
-createSubstDiag(Sema &S, TemplateDeductionInfo &Info, EntityPrinter Printer) {
+createSubstDiag(Sema &S, TemplateDeductionInfo &Info,
+                concepts::EntityPrinter Printer) {
   SmallString<128> Message;
   SourceLocation ErrorLoc;
   if (Info.hasSFINAEDiagnostic()) {
@@ -2302,6 +2302,19 @@ createSubstDiag(Sema &S, TemplateDeductionInfo &Info, EntityPrinter Printer) {
       StringRef(MessageBuf, Message.size())};
 }
 
+concepts::Requirement::SubstitutionDiagnostic *
+concepts::createSubstDiagAt(Sema &S, SourceLocation Location,
+                            EntityPrinter Printer) {
+  SmallString<128> Entity;
+  llvm::raw_svector_ostream OS(Entity);
+  Printer(OS);
+  char *EntityBuf = new (S.Context) char[Entity.size()];
+  llvm::copy(Entity, EntityBuf);
+  return new (S.Context) concepts::Requirement::SubstitutionDiagnostic{
+      /*SubstitutedEntity=*/StringRef(EntityBuf, Entity.size()),
+      /*DiagLoc=*/Location, /*DiagMessage=*/StringRef()};
+}
+
 ExprResult TemplateInstantiator::TransformRequiresTypeParams(
     SourceLocation KWLoc, SourceLocation RBraceLoc, const RequiresExpr *RE,
     RequiresExprBodyDecl *Body, ArrayRef<ParmVarDecl *> Params,
diff --git a/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp b/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp
new file mode 100644
index 0000000000000..00a39f9f03b79
--- /dev/null
+++ b/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp
@@ -0,0 +1,37 @@
+// RUN: %clang_cc1 -fsyntax-only -std=c++20 -verify %s
+
+template <typename Iterator> class normal_iterator {};
+
+template <typename From, typename To> struct is_convertible {};
+
+template <typename From, typename To>
+inline constexpr bool is_convertible_v = is_convertible<From, To>::value; // expected-error {{no member named 'value' in 'is_convertible<bool, bool>'}}
+
+template <typename From, typename To>
+concept convertible_to = is_convertible_v<From, To>; // #1
+
+template <typename IteratorL, typename IteratorR>
+  requires requires(IteratorL lhs, IteratorR rhs) { // #2
+    { lhs == rhs } -> convertible_to<bool>; // #3
+  }
+constexpr bool compare(normal_iterator<IteratorL> lhs, normal_iterator<IteratorR> rhs) { // #4
+  return false;
+}
+
+class Object;
+
+void function() {
+  normal_iterator<Object *> begin, end;
+  compare(begin, end); // expected-error {{no matching function for call to 'compare'}} #5
+}
+
+// expected-note@#1 {{in instantiation of variable template specialization 'is_convertible_v<bool, bool>' requested here}}
+// expected-note@#1 {{substituting template arguments into constraint expression here}}
+// expected-note@#3 {{checking the satisfaction of concept 'convertible_to<bool, bool>'}}
+// expected-note@#2 {{substituting template arguments into constraint expression here}}
+// expected-note@#5 {{checking constraint satisfaction for template 'compare<Object *, Object *>'}}
+// expected-note@#5 {{in instantiation of function template specialization 'compare<Object *, Object *>' requested here}}
+
+// expected-note@#4 {{candidate template ignored: constraints not satisfied [with IteratorL = Object *, IteratorR = Object *]}}
+// We don't know exactly the substituted type for `lhs == rhs`, thus a placeholder 'expr-type' is emitted.
+// expected-note@#3 {{because 'convertible_to<expr-type, bool>' would be invalid}}
diff --git a/clang/test/SemaCXX/concept-fatal-error.cpp b/clang/test/SemaCXX/concept-fatal-error.cpp
index c299b39fdeb23..c606b9e21a364 100644
--- a/clang/test/SemaCXX/concept-fatal-error.cpp
+++ b/clang/test/SemaCXX/concept-fatal-error.cpp
@@ -1,4 +1,4 @@
-// RUN: not %clang_cc1 -fsyntax-only -std=c++20 -ferror-limit 1 -verify %s
+// RUN: %clang_cc1 -fsyntax-only -std=c++20 -ferror-limit 1 -verify %s
 
 template <class>
 concept f = requires { 42; };
@@ -6,5 +6,5 @@ struct h {
   // The missing semicolon will trigger an error and -ferror-limit=1 will make it fatal
   // We test that we do not crash in such cases (#55401)
   int i = requires { { i } f } // expected-error {{expected ';' at end of declaration list}}
-                               // expected-error@* {{too many errros emitted}}
+                               // expected-error@* {{too many errors emitted}}
 };

From 5b4513afa40647eab89c5a55ab2d1c893229e873 Mon Sep 17 00:00:00 2001
From: Younan Zhang <zyn7109@gmail.com>
Date: Mon, 28 Aug 2023 14:51:59 +0800
Subject: [PATCH 88/92] [clang][clangd] Ensure the stack bottom before building
 AST

`clang::runWithSufficientStackSpace` requires the address of the
initial stack bottom to prevent potential stack overflows.

In addition, add a fallback to ASTFrontendAction in case any client
forgets to call it when not through CompilerInstance::ExecuteAction,
which is rare.

Fixes https://github.com/clangd/clangd/issues/1745.

Reviewed By: sammccall

Differential Revision: https://reviews.llvm.org/D158967

(cherry picked from commit e257c0a9190637e44e292271103a13d70bec4b03)
---
 clang-tools-extra/clangd/ClangdServer.cpp           |  4 +++-
 clang-tools-extra/clangd/TUScheduler.cpp            |  7 +++++++
 clang-tools-extra/clangd/index/Background.cpp       |  2 ++
 .../clangd/test/infinite-instantiation.test         | 13 +++++++++++++
 clang-tools-extra/clangd/tool/ClangdMain.cpp        |  4 ++++
 clang/lib/Frontend/FrontendAction.cpp               |  5 +++++
 6 files changed, 34 insertions(+), 1 deletion(-)
 create mode 100644 clang-tools-extra/clangd/test/infinite-instantiation.test

diff --git a/clang-tools-extra/clangd/ClangdServer.cpp b/clang-tools-extra/clangd/ClangdServer.cpp
index d44d1e272b9b7..8b542d0b2dec2 100644
--- a/clang-tools-extra/clangd/ClangdServer.cpp
+++ b/clang-tools-extra/clangd/ClangdServer.cpp
@@ -34,6 +34,7 @@
 #include "support/MemoryTree.h"
 #include "support/ThreadsafeFS.h"
 #include "support/Trace.h"
+#include "clang/Basic/Stack.h"
 #include "clang/Format/Format.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Tooling/CompilationDatabase.h"
@@ -52,8 +53,8 @@
 #include <optional>
 #include <string>
 #include <type_traits>
-#include <vector>
 #include <utility>
+#include <vector>
 
 namespace clang {
 namespace clangd {
@@ -112,6 +113,7 @@ struct UpdateIndexCallbacks : public ParsingCallbacks {
                  FIndex(FIndex),
                  // shared_ptr extends lifetime
                  Stdlib(Stdlib)]() mutable {
+      clang::noteBottomOfStack();
       IndexFileIn IF;
       IF.Symbols = indexStandardLibrary(std::move(CI), Loc, *TFS);
       if (Stdlib->isBest(LO))
diff --git a/clang-tools-extra/clangd/TUScheduler.cpp b/clang-tools-extra/clangd/TUScheduler.cpp
index dd2ce16147a5d..324ba1fc8cb89 100644
--- a/clang-tools-extra/clangd/TUScheduler.cpp
+++ b/clang-tools-extra/clangd/TUScheduler.cpp
@@ -63,6 +63,7 @@
 #include "support/ThreadCrashReporter.h"
 #include "support/Threading.h"
 #include "support/Trace.h"
+#include "clang/Basic/Stack.h"
 #include "clang/Frontend/CompilerInvocation.h"
 #include "clang/Tooling/CompilationDatabase.h"
 #include "llvm/ADT/FunctionExtras.h"
@@ -464,6 +465,10 @@ class PreambleThread {
   }
 
   void run() {
+    // Mark the current frame as the stack bottom so that clang running on this
+    // thread can track its stack usage and make a best-effort attempt to avoid
+    // stack overflow. The same applies to other calls throughout clangd.
+    clang::noteBottomOfStack();
     while (true) {
       std::optional<PreambleThrottlerRequest> Throttle;
       {
@@ -1383,6 +1388,7 @@ void ASTWorker::startTask(llvm::StringRef Name,
 }
 
 void ASTWorker::run() {
+  clang::noteBottomOfStack();
   while (true) {
     {
       std::unique_lock<std::mutex> Lock(Mutex);
@@ -1777,6 +1783,7 @@ void TUScheduler::runWithPreamble(llvm::StringRef Name, PathRef File,
                Ctx = Context::current().derive(FileBeingProcessed,
                                                std::string(File)),
                Action = std::move(Action), this]() mutable {
+    clang::noteBottomOfStack();
     ThreadCrashReporter ScopedReporter([&Name, &Contents, &Command]() {
       llvm::errs() << "Signalled during preamble action: " << Name << "\n";
       crashDumpCompileCommand(llvm::errs(), Command);
diff --git a/clang-tools-extra/clangd/index/Background.cpp b/clang-tools-extra/clangd/index/Background.cpp
index c35de750435cc..7ef9511cf7c07 100644
--- a/clang-tools-extra/clangd/index/Background.cpp
+++ b/clang-tools-extra/clangd/index/Background.cpp
@@ -30,6 +30,7 @@
 #include "support/Trace.h"
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Basic/SourceManager.h"
+#include "clang/Basic/Stack.h"
 #include "clang/Frontend/FrontendAction.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseSet.h"
@@ -108,6 +109,7 @@ BackgroundIndex::BackgroundIndex(
   for (unsigned I = 0; I < Opts.ThreadPoolSize; ++I) {
     ThreadPool.runAsync("background-worker-" + llvm::Twine(I + 1),
                         [this, Ctx(Context::current().clone())]() mutable {
+                          clang::noteBottomOfStack();
                           WithContext BGContext(std::move(Ctx));
                           Queue.work([&] { Rebuilder.idle(); });
                         });
diff --git a/clang-tools-extra/clangd/test/infinite-instantiation.test b/clang-tools-extra/clangd/test/infinite-instantiation.test
new file mode 100644
index 0000000000000..85a1b656f4908
--- /dev/null
+++ b/clang-tools-extra/clangd/test/infinite-instantiation.test
@@ -0,0 +1,13 @@
+// RUN: cp %s %t.cpp
+// RUN: not clangd -check=%t.cpp 2>&1 | FileCheck -strict-whitespace %s
+
+// CHECK: [template_recursion_depth_exceeded]
+
+template <typename... T>
+constexpr int f(T... args) {
+  return f(0, args...);
+}
+
+int main() {
+  auto i = f();
+}
diff --git a/clang-tools-extra/clangd/tool/ClangdMain.cpp b/clang-tools-extra/clangd/tool/ClangdMain.cpp
index ca5cced197cd2..f656a8c587c65 100644
--- a/clang-tools-extra/clangd/tool/ClangdMain.cpp
+++ b/clang-tools-extra/clangd/tool/ClangdMain.cpp
@@ -29,6 +29,7 @@
 #include "support/ThreadCrashReporter.h"
 #include "support/ThreadsafeFS.h"
 #include "support/Trace.h"
+#include "clang/Basic/Stack.h"
 #include "clang/Format/Format.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringRef.h"
@@ -710,6 +711,9 @@ enum class ErrorResultCode : int {
 };
 
 int clangdMain(int argc, char *argv[]) {
+  // Clang could run on the main thread, e.g., when the flag '-check' or '-sync'
+  // is enabled.
+  clang::noteBottomOfStack();
   llvm::InitializeAllTargetInfos();
   llvm::sys::PrintStackTraceOnErrorSignal(argv[0]);
   llvm::sys::AddSignalHandler(
diff --git a/clang/lib/Frontend/FrontendAction.cpp b/clang/lib/Frontend/FrontendAction.cpp
index c6f958a6077bf..0bd4b01ff79db 100644
--- a/clang/lib/Frontend/FrontendAction.cpp
+++ b/clang/lib/Frontend/FrontendAction.cpp
@@ -15,6 +15,7 @@
 #include "clang/Basic/FileEntry.h"
 #include "clang/Basic/LangStandard.h"
 #include "clang/Basic/Sarif.h"
+#include "clang/Basic/Stack.h"
 #include "clang/Frontend/ASTUnit.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
@@ -1150,6 +1151,10 @@ void ASTFrontendAction::ExecuteAction() {
   CompilerInstance &CI = getCompilerInstance();
   if (!CI.hasPreprocessor())
     return;
+  // This is a fallback: if the client forgot to call noteBottomOfStack(), mark
+  // the current stack position as the bottom. Though not optimal, this could
+  // help prevent stack overflow during deep recursion.
+  clang::noteBottomOfStack();
 
   // FIXME: Move the truncation aspect of this into Sema, we delayed this till
   // here so the source manager would be initialized.

From 9597525dc133dd05a4909ac4e2f18ee946cd5785 Mon Sep 17 00:00:00 2001
From: Takuya Shimizu <shimizu2486@gmail.com>
Date: Mon, 4 Sep 2023 11:19:46 +0900
Subject: [PATCH 89/92] Revert "[clang][Docs] Added release note for D142609"

The associated commit was reverted and backported in a93ca35, so this
release note line should also be removed.

This reverts commit 061e855767dbe0821d81a8d47158f468dd00ae5f.
---
 clang/docs/ReleaseNotes.rst | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 456c724514369..b161be3a07752 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -477,9 +477,6 @@ Improvements to Clang's diagnostics
 - ``-Wformat`` will no longer suggest a no-op fix-it for fixing scoped enum format
   warnings. Instead, it will suggest casting the enum object to the type specified
   in the format string.
-- Clang now emits ``-Wconstant-logical-operand`` warning even when constant logical
-  operand is on left side.
-  (`#37919 <https://github.com/llvm/llvm-project/issues/37919>`_)
 - Clang contexpr evaluator now displays notes as well as an error when a constructor
   of a base class is not called in the constructor of its derived class.
 

From ae9c45b112598aa9a411ffe4cebd213d7f9186b6 Mon Sep 17 00:00:00 2001
From: DianQK <dianqk@dianqk.net>
Date: Sun, 3 Sep 2023 12:18:57 +0800
Subject: [PATCH 90/92] [JumpThreading][NFC] Pre-commit for invalid LVI.

(cherry picked from commit 5855a4be9cbc3c4584d8a1632886c347044dfbef)
---
 .../JumpThreading/invalidate-lvi.ll           | 53 +++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100644 llvm/test/Transforms/JumpThreading/invalidate-lvi.ll

diff --git a/llvm/test/Transforms/JumpThreading/invalidate-lvi.ll b/llvm/test/Transforms/JumpThreading/invalidate-lvi.ll
new file mode 100644
index 0000000000000..9c5cbfac62d9f
--- /dev/null
+++ b/llvm/test/Transforms/JumpThreading/invalidate-lvi.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=jump-threading < %s | FileCheck %s
+
+declare void @set_value(ptr)
+
+declare void @bar()
+
+define void @foo(i1 %0) {
+; CHECK-LABEL: define void @foo(
+; CHECK-SAME: i1 [[TMP0:%.*]]) {
+; CHECK-NEXT:  start:
+; CHECK-NEXT:    [[V:%.*]] = alloca i64, align 8
+; CHECK-NEXT:    call void @set_value(ptr [[V]])
+; CHECK-NEXT:    [[L1:%.*]] = load i64, ptr [[V]], align 8
+; CHECK-NEXT:    br i1 [[TMP0]], label [[BB0:%.*]], label [[BB4:%.*]]
+; CHECK:       bb0:
+; CHECK-NEXT:    [[C1:%.*]] = icmp eq i64 [[L1]], 0
+; CHECK-NEXT:    br i1 [[C1]], label [[BB1:%.*]], label [[BB4]]
+; CHECK:       bb1:
+; CHECK-NEXT:    store i64 0, ptr [[V]], align 8
+; CHECK-NEXT:    br label [[BB4]]
+; CHECK:       bb4:
+; CHECK-NEXT:    [[L2:%.*]] = phi i64 [ 0, [[BB1]] ], [ [[L1]], [[BB0]] ], [ [[L1]], [[START:%.*]] ]
+; CHECK-NEXT:    ret void
+;
+start:
+  %v = alloca i64, align 8
+  call void @set_value(ptr %v)
+  %l1 = load i64, ptr %v, align 8, !range !0
+  br i1 %0, label %bb0, label %bb2
+
+bb0:                                              ; preds = %start
+  %c1 = icmp eq i64 %l1, 0
+  br i1 %c1, label %bb1, label %bb2
+
+bb1:                                              ; preds = %bb0
+  store i64 0, ptr %v, align 8
+  br label %bb2
+
+bb2:                                              ; preds = %bb1, %bb0, %start
+  %l2 = load i64, ptr %v, align 8
+  %1 = icmp eq i64 %l2, 2
+  br i1 %1, label %bb3, label %bb4
+
+bb3:                                              ; preds = %bb2
+  call void @bar()
+  ret void
+
+bb4:                                              ; preds = %bb2
+  ret void
+}
+
+!0 = !{i64 0, i64 2}

From dccf183bb26073c6b818ea24ec31ff87563d73e2 Mon Sep 17 00:00:00 2001
From: DianQK <dianqk@dianqk.net>
Date: Sun, 3 Sep 2023 12:23:33 +0800
Subject: [PATCH 91/92] [JumpThreading] Invalidate LVI after
 `combineMetadataForCSE`.

(cherry picked from commit 7ded71b1e43fff0be3acb74038bfea87f38d5cfa)
---
 llvm/include/llvm/Analysis/LazyValueInfo.h       |  3 +++
 llvm/lib/Analysis/LazyValueInfo.cpp              |  9 +++++++++
 llvm/lib/Transforms/Scalar/JumpThreading.cpp     |  2 ++
 .../Transforms/JumpThreading/invalidate-lvi.ll   | 16 +++++++++++-----
 4 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/llvm/include/llvm/Analysis/LazyValueInfo.h b/llvm/include/llvm/Analysis/LazyValueInfo.h
index b109b7f7e65ae..7b2bfdac75a8f 100644
--- a/llvm/include/llvm/Analysis/LazyValueInfo.h
+++ b/llvm/include/llvm/Analysis/LazyValueInfo.h
@@ -115,6 +115,9 @@ class LazyValueInfo {
   /// PredBB to OldSucc to be from PredBB to NewSucc instead.
   void threadEdge(BasicBlock *PredBB, BasicBlock *OldSucc, BasicBlock *NewSucc);
 
+  /// Remove information related to this value from the cache.
+  void forgetValue(Value *V);
+
   /// Inform the analysis cache that we have erased a block.
   void eraseBlock(BasicBlock *BB);
 
diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp
index 33651783cb177..2ba6036056d99 100644
--- a/llvm/lib/Analysis/LazyValueInfo.cpp
+++ b/llvm/lib/Analysis/LazyValueInfo.cpp
@@ -465,6 +465,10 @@ class LazyValueInfoImpl {
     F.print(OS, &Writer);
   }
 
+  /// This is part of the update interface to remove information related to this
+  /// value from the cache.
+  void forgetValue(Value *V) { TheCache.eraseValue(V); }
+
   /// This is part of the update interface to inform the cache
   /// that a block has been deleted.
   void eraseBlock(BasicBlock *BB) {
@@ -1969,6 +1973,11 @@ void LazyValueInfo::threadEdge(BasicBlock *PredBB, BasicBlock *OldSucc,
   }
 }
 
+void LazyValueInfo::forgetValue(Value *V) {
+  if (PImpl)
+    getImpl(PImpl, AC, nullptr).forgetValue(V);
+}
+
 void LazyValueInfo::eraseBlock(BasicBlock *BB) {
   if (PImpl) {
     getImpl(PImpl, AC, BB->getModule()).eraseBlock(BB);
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 24390f1b54f60..5b8f1b00dc034 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -1269,6 +1269,7 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) {
     if (IsLoadCSE) {
       LoadInst *NLoadI = cast<LoadInst>(AvailableVal);
       combineMetadataForCSE(NLoadI, LoadI, false);
+      LVI->forgetValue(NLoadI);
     };
 
     // If the returned value is the load itself, replace with poison. This can
@@ -1461,6 +1462,7 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) {
 
   for (LoadInst *PredLoadI : CSELoads) {
     combineMetadataForCSE(PredLoadI, LoadI, true);
+    LVI->forgetValue(PredLoadI);
   }
 
   LoadI->replaceAllUsesWith(PN);
diff --git a/llvm/test/Transforms/JumpThreading/invalidate-lvi.ll b/llvm/test/Transforms/JumpThreading/invalidate-lvi.ll
index 9c5cbfac62d9f..27191d6f54c2d 100644
--- a/llvm/test/Transforms/JumpThreading/invalidate-lvi.ll
+++ b/llvm/test/Transforms/JumpThreading/invalidate-lvi.ll
@@ -12,15 +12,21 @@ define void @foo(i1 %0) {
 ; CHECK-NEXT:    [[V:%.*]] = alloca i64, align 8
 ; CHECK-NEXT:    call void @set_value(ptr [[V]])
 ; CHECK-NEXT:    [[L1:%.*]] = load i64, ptr [[V]], align 8
-; CHECK-NEXT:    br i1 [[TMP0]], label [[BB0:%.*]], label [[BB4:%.*]]
+; CHECK-NEXT:    br i1 [[TMP0]], label [[BB0:%.*]], label [[BB2:%.*]]
 ; CHECK:       bb0:
 ; CHECK-NEXT:    [[C1:%.*]] = icmp eq i64 [[L1]], 0
-; CHECK-NEXT:    br i1 [[C1]], label [[BB1:%.*]], label [[BB4]]
-; CHECK:       bb1:
+; CHECK-NEXT:    br i1 [[C1]], label [[BB2_THREAD:%.*]], label [[BB2]]
+; CHECK:       bb2.thread:
 ; CHECK-NEXT:    store i64 0, ptr [[V]], align 8
-; CHECK-NEXT:    br label [[BB4]]
+; CHECK-NEXT:    br label [[BB4:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[L2:%.*]] = phi i64 [ [[L1]], [[BB0]] ], [ [[L1]], [[START:%.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[L2]], 2
+; CHECK-NEXT:    br i1 [[TMP1]], label [[BB3:%.*]], label [[BB4]]
+; CHECK:       bb3:
+; CHECK-NEXT:    call void @bar()
+; CHECK-NEXT:    ret void
 ; CHECK:       bb4:
-; CHECK-NEXT:    [[L2:%.*]] = phi i64 [ 0, [[BB1]] ], [ [[L1]], [[BB0]] ], [ [[L1]], [[START:%.*]] ]
 ; CHECK-NEXT:    ret void
 ;
 start:

From 092b6c5ee3707ea10b9f10d0a674e8d12395369b Mon Sep 17 00:00:00 2001
From: Christian Sigg <csigg@google.com>
Date: Wed, 30 Aug 2023 12:23:25 +0200
Subject: [PATCH 92/92] [mlir][nfc] Allow ops to have operands/attributes named
 `context`.

This is probably a bad idea, but it's only become a problem with properties and is easy to fix.

Reviewed By: mehdi_amini

Differential Revision: https://reviews.llvm.org/D159185
---
 mlir/include/mlir/IR/OperationSupport.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/IR/OperationSupport.h b/mlir/include/mlir/IR/OperationSupport.h
index f3a79eb52f8ec..adae3560570dd 100644
--- a/mlir/include/mlir/IR/OperationSupport.h
+++ b/mlir/include/mlir/IR/OperationSupport.h
@@ -555,7 +555,7 @@ class RegisteredOperationName : public OperationName {
                                              StringRef name) final {
       if constexpr (hasProperties) {
         auto concreteOp = cast<ConcreteOp>(op);
-        return ConcreteOp::getInherentAttr(concreteOp.getContext(),
+        return ConcreteOp::getInherentAttr(concreteOp->getContext(),
                                            concreteOp.getProperties(), name);
       }
       // If the op does not have support for properties, we dispatch back to the
@@ -576,7 +576,7 @@ class RegisteredOperationName : public OperationName {
     void populateInherentAttrs(Operation *op, NamedAttrList &attrs) final {
       if constexpr (hasProperties) {
         auto concreteOp = cast<ConcreteOp>(op);
-        ConcreteOp::populateInherentAttrs(concreteOp.getContext(),
+        ConcreteOp::populateInherentAttrs(concreteOp->getContext(),
                                           concreteOp.getProperties(), attrs);
       }
     }