diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index 6b950d05fb9bf9..6a886a49ea0762 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -648,7 +648,8 @@ elementwise to the input. Unless specified otherwise operation(±0) = ±0 and operation(±infinity) = ±infinity The integer elementwise intrinsics, including ``__builtin_elementwise_popcount``, -``__builtin_elementwise_bitreverse``, can be called in a ``constexpr`` context. +``__builtin_elementwise_bitreverse``, ``__builtin_elementwise_add_sat``, +and ``__builtin_elementwise_sub_sat`` can be called in a ``constexpr`` context. ============================================== ====================================================================== ========================================= Name Operation Supported element types diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index ff1ee5ef811fa4..242aaf7ce4e475 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -416,12 +416,11 @@ Non-comprehensive list of changes in this release The flexible array member (FAM) can now be accessed immediately without causing issues with the sanitizer because the counter is automatically set. -- ``__builtin_reduce_add`` function can now be used in constant expressions. -- ``__builtin_reduce_mul`` function can now be used in constant expressions. -- ``__builtin_reduce_and`` function can now be used in constant expressions. -- ``__builtin_reduce_or`` and ``__builtin_reduce_xor`` functions can now be used in constant expressions. -- ``__builtin_elementwise_popcount`` function can now be used in constant expressions. -- ``__builtin_elementwise_bitreverse`` function can now be used in constant expressions. +- The following builtins can now be used in constant expressions: ``__builtin_reduce_add``, + ``__builtin_reduce_mul``, ``__builtin_reduce_and``, ``__builtin_reduce_or``, + ``__builtin_reduce_xor``, ``__builtin_elementwise_popcount``, + ``__builtin_elementwise_bitreverse``, ``__builtin_elementwise_add_sat``, + ``__builtin_elementwise_sub_sat``. New Compiler Flags ------------------ @@ -828,6 +827,7 @@ Bug Fixes to C++ Support - Fixed a bug where bounds of partially expanded pack indexing expressions were checked too early. (#GH116105) - Fixed an assertion failure caused by using ``consteval`` in condition in consumed analyses. (#GH117385) - Fix a crash caused by incorrect argument position in merging deduced template arguments. (#GH113659) +- Fixed a parser crash when using pack indexing as a nested name specifier. (#GH119072) - Fixed an assertion failure caused by mangled names with invalid identifiers. (#GH112205) - Fixed an incorrect lambda scope of generic lambdas that caused Clang to crash when computing potential lambda captures at the end of a full expression.
(#GH115931) diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index e2c3d3c535571c..32a09e2ceb3857 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -1450,13 +1450,13 @@ def ElementwiseFma : Builtin { def ElementwiseAddSat : Builtin { let Spellings = ["__builtin_elementwise_add_sat"]; - let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Attributes = [NoThrow, Const, CustomTypeChecking, Constexpr]; let Prototype = "void(...)"; } def ElementwiseSubSat : Builtin { let Spellings = ["__builtin_elementwise_sub_sat"]; - let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Attributes = [NoThrow, Const, CustomTypeChecking, Constexpr]; let Prototype = "void(...)"; } diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 6b5b95aee35522..86313fbde0b4a7 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -11339,6 +11339,37 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), ResultElements.size()), E); } + case Builtin::BI__builtin_elementwise_add_sat: + case Builtin::BI__builtin_elementwise_sub_sat: { + APValue SourceLHS, SourceRHS; + if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) || + !EvaluateAsRValue(Info, E->getArg(1), SourceRHS)) + return false; + + QualType DestEltTy = E->getType()->castAs<VectorType>()->getElementType(); + unsigned SourceLen = SourceLHS.getVectorLength(); + SmallVector<APValue, 4> ResultElements; + ResultElements.reserve(SourceLen); + + for (unsigned EltNum = 0; EltNum < SourceLen; ++EltNum) { + APSInt LHS = SourceLHS.getVectorElt(EltNum).getInt(); + APSInt RHS = SourceRHS.getVectorElt(EltNum).getInt(); + switch (E->getBuiltinCallee()) { + case Builtin::BI__builtin_elementwise_add_sat: + ResultElements.push_back(APValue( + APSInt(LHS.isSigned() ? LHS.sadd_sat(RHS) : LHS.uadd_sat(RHS), + DestEltTy->isUnsignedIntegerOrEnumerationType()))); + break; + case Builtin::BI__builtin_elementwise_sub_sat: + ResultElements.push_back(APValue( + APSInt(LHS.isSigned() ? LHS.ssub_sat(RHS) : LHS.usub_sat(RHS), + DestEltTy->isUnsignedIntegerOrEnumerationType()))); + break; + } + } + + return Success(APValue(ResultElements.data(), ResultElements.size()), E); + } } } @@ -13204,6 +13235,25 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, return Success(Val.rotr(Amt.urem(Val.getBitWidth())), E); } + case Builtin::BI__builtin_elementwise_add_sat: { + APSInt LHS, RHS; + if (!EvaluateInteger(E->getArg(0), LHS, Info) || + !EvaluateInteger(E->getArg(1), RHS, Info)) + return false; + + APInt Result = LHS.isSigned() ? LHS.sadd_sat(RHS) : LHS.uadd_sat(RHS); + return Success(APSInt(Result, !LHS.isSigned()), E); + } + case Builtin::BI__builtin_elementwise_sub_sat: { + APSInt LHS, RHS; + if (!EvaluateInteger(E->getArg(0), LHS, Info) || + !EvaluateInteger(E->getArg(1), RHS, Info)) + return false; + + APInt Result = LHS.isSigned() ? LHS.ssub_sat(RHS) : LHS.usub_sat(RHS); + return Success(APSInt(Result, !LHS.isSigned()), E); + } + case Builtin::BIstrlen: case Builtin::BIwcslen: // A call to strlen is not a constant expression.
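As an illustration of the behavior the constant evaluator implements above (not part of the patch; a minimal standalone C++ sketch, where `__INT_MAX__` is Clang's predefined maximum-`int` macro):

static_assert(__builtin_elementwise_add_sat(__INT_MAX__, 1) == __INT_MAX__); // signed addition clamps at INT_MAX instead of overflowing
static_assert(__builtin_elementwise_sub_sat(0U, 1U) == 0U); // unsigned subtraction clamps at zero instead of wrapping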
diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp index 736484ded8383c..8dd72db8f5b4a2 100644 --- a/clang/lib/Parse/ParseExpr.cpp +++ b/clang/lib/Parse/ParseExpr.cpp @@ -1199,7 +1199,7 @@ ExprResult Parser::ParseCastExpression(CastParseKind ParseKind, // If the token is not annotated, then it might be an expression pack // indexing if (!TryAnnotateTypeOrScopeToken() && - Tok.is(tok::annot_pack_indexing_type)) + Tok.isOneOf(tok::annot_pack_indexing_type, tok::annot_cxxscope)) return ParseCastExpression(ParseKind, isAddressOfOperand, isTypeCast, isVectorLiteral, NotPrimaryExpression); } diff --git a/clang/test/CodeGen/builtins-elementwise-math.c b/clang/test/CodeGen/builtins-elementwise-math.c index 82f82dd1ed7944..7f6b5f26eb9307 100644 --- a/clang/test/CodeGen/builtins-elementwise-math.c +++ b/clang/test/CodeGen/builtins-elementwise-math.c @@ -112,7 +112,7 @@ void test_builtin_elementwise_add_sat(float f1, float f2, double d1, double d2, // CHECK-NEXT: call i32 @llvm.sadd.sat.i32(i32 [[IAS1]], i32 [[B]]) int_as_one = __builtin_elementwise_add_sat(int_as_one, b); - // CHECK: call i32 @llvm.sadd.sat.i32(i32 1, i32 97) + // CHECK: store i64 98, ptr %i1.addr, align 8 i1 = __builtin_elementwise_add_sat(1, 'a'); } @@ -165,7 +165,7 @@ void test_builtin_elementwise_sub_sat(float f1, float f2, double d1, double d2, // CHECK-NEXT: call i32 @llvm.ssub.sat.i32(i32 [[IAS1]], i32 [[B]]) int_as_one = __builtin_elementwise_sub_sat(int_as_one, b); - // CHECK: call i32 @llvm.ssub.sat.i32(i32 1, i32 97) + // CHECK: store i64 -96, ptr %i1.addr, align 8 i1 = __builtin_elementwise_sub_sat(1, 'a'); } diff --git a/clang/test/Driver/aarch64-fujitsu-monaka.c b/clang/test/Driver/aarch64-fujitsu-monaka.c new file mode 100644 index 00000000000000..df96b36bace681 --- /dev/null +++ b/clang/test/Driver/aarch64-fujitsu-monaka.c @@ -0,0 +1,13 @@ +// RUN: %clang --target=aarch64 -mcpu=fujitsu-monaka -### -c %s 2>&1 | FileCheck -check-prefix=fujitsu-monaka %s +// RUN: %clang --target=aarch64 -mlittle-endian -mcpu=fujitsu-monaka -### -c %s 2>&1 | FileCheck -check-prefix=fujitsu-monaka %s +// RUN: %clang --target=aarch64 -mtune=fujitsu-monaka -### -c %s 2>&1 | FileCheck -check-prefix=fujitsu-monaka-TUNE %s +// RUN: %clang --target=aarch64 -mlittle-endian -mtune=fujitsu-monaka -### -c %s 2>&1 | FileCheck -check-prefix=fujitsu-monaka-TUNE %s +// fujitsu-monaka: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "fujitsu-monaka" +// fujitsu-monaka-TUNE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" + +// RUN: %clang --target=arm64 -mcpu=fujitsu-monaka -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-fujitsu-monaka %s +// RUN: %clang --target=arm64 -mlittle-endian -mcpu=fujitsu-monaka -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-fujitsu-monaka %s +// RUN: %clang --target=arm64 -mtune=fujitsu-monaka -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-fujitsu-monaka-TUNE %s +// RUN: %clang --target=arm64 -mlittle-endian -mtune=fujitsu-monaka -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-fujitsu-monaka-TUNE %s +// ARM64-fujitsu-monaka: "-cc1"{{.*}} "-triple" "arm64{{.*}}" "-target-cpu" "fujitsu-monaka" +// ARM64-fujitsu-monaka-TUNE: "-cc1"{{.*}} "-triple" "arm64{{.*}}" "-target-cpu" "generic" diff --git a/clang/test/Driver/print-enabled-extensions/aarch64-fujitsu-monaka.c b/clang/test/Driver/print-enabled-extensions/aarch64-fujitsu-monaka.c new file mode 100644 index 00000000000000..3c74e3620df034 --- /dev/null +++ b/clang/test/Driver/print-enabled-extensions/aarch64-fujitsu-monaka.c @@ -0,0 
+1,82 @@ +// REQUIRES: aarch64-registered-target +// RUN: %clang --target=aarch64 --print-enabled-extensions -mcpu=fujitsu-monaka | FileCheck --strict-whitespace --implicit-check-not=FEAT_ %s + +// CHECK: Extensions enabled for the given AArch64 target +// CHECK-EMPTY: +// CHECK-NEXT: Architecture Feature(s) Description +// CHECK-NEXT: FEAT_AES, FEAT_PMULL Enable AES support +// CHECK-NEXT: FEAT_AMUv1 Enable Armv8.4-A Activity Monitors extension +// CHECK-NEXT: FEAT_AMUv1p1 Enable Armv8.6-A Activity Monitors Virtualization support +// CHECK-NEXT: FEAT_AdvSIMD Enable Advanced SIMD instructions +// CHECK-NEXT: FEAT_BF16 Enable BFloat16 Extension +// CHECK-NEXT: FEAT_BTI Enable Branch Target Identification +// CHECK-NEXT: FEAT_CCIDX Enable Armv8.3-A Extend of the CCSIDR number of sets +// CHECK-NEXT: FEAT_CLRBHB Enable Clear BHB instruction +// CHECK-NEXT: FEAT_CRC32 Enable Armv8.0-A CRC-32 checksum instructions +// CHECK-NEXT: FEAT_CSV2_2 Enable architectural speculation restriction +// CHECK-NEXT: FEAT_DIT Enable Armv8.4-A Data Independent Timing instructions +// CHECK-NEXT: FEAT_DPB Enable Armv8.2-A data Cache Clean to Point of Persistence +// CHECK-NEXT: FEAT_DPB2 Enable Armv8.5-A Cache Clean to Point of Deep Persistence +// CHECK-NEXT: FEAT_DotProd Enable dot product support +// CHECK-NEXT: FEAT_ECV Enable enhanced counter virtualization extension +// CHECK-NEXT: FEAT_ETE Enable Embedded Trace Extension +// CHECK-NEXT: FEAT_FAMINMAX Enable FAMIN and FAMAX instructions +// CHECK-NEXT: FEAT_FCMA Enable Armv8.3-A Floating-point complex number support +// CHECK-NEXT: FEAT_FGT Enable fine grained virtualization traps extension +// CHECK-NEXT: FEAT_FHM Enable FP16 FML instructions +// CHECK-NEXT: FEAT_FP Enable Armv8.0-A Floating Point Extensions +// CHECK-NEXT: FEAT_FP16 Enable half-precision floating-point data processing +// CHECK-NEXT: FEAT_FP8 Enable FP8 instructions +// CHECK-NEXT: FEAT_FP8DOT2 Enable FP8 2-way dot instructions +// CHECK-NEXT: FEAT_FP8DOT4 Enable FP8 4-way dot instructions +// CHECK-NEXT: FEAT_FP8FMA Enable Armv9.5-A FP8 multiply-add instructions +// CHECK-NEXT: FEAT_FPAC Enable Armv8.3-A Pointer Authentication Faulting enhancement +// CHECK-NEXT: FEAT_FRINTTS Enable FRInt[32|64][Z|X] instructions that round a floating-point number to an integer (in FP format) forcing it to fit into a 32- or 64-bit int +// CHECK-NEXT: FEAT_FlagM Enable Armv8.4-A Flag Manipulation instructions +// CHECK-NEXT: FEAT_FlagM2 Enable alternative NZCV format for floating point comparisons +// CHECK-NEXT: FEAT_HBC Enable Armv8.8-A Hinted Conditional Branches Extension +// CHECK-NEXT: FEAT_HCX Enable Armv8.7-A HCRX_EL2 system register +// CHECK-NEXT: FEAT_I8MM Enable Matrix Multiply Int8 Extension +// CHECK-NEXT: FEAT_JSCVT Enable Armv8.3-A JavaScript FP conversion instructions +// CHECK-NEXT: FEAT_LOR Enable Armv8.1-A Limited Ordering Regions extension +// CHECK-NEXT: FEAT_LRCPC Enable support for RCPC extension +// CHECK-NEXT: FEAT_LRCPC2 Enable Armv8.4-A RCPC instructions with Immediate Offsets +// CHECK-NEXT: FEAT_LS64, FEAT_LS64_V, FEAT_LS64_ACCDATA Enable Armv8.7-A LD64B/ST64B Accelerator Extension +// CHECK-NEXT: FEAT_LSE Enable Armv8.1-A Large System Extension (LSE) atomic instructions +// CHECK-NEXT: FEAT_LSE2 Enable Armv8.4-A Large System Extension 2 (LSE2) atomicity rules +// CHECK-NEXT: FEAT_LUT Enable Lookup Table instructions +// CHECK-NEXT: FEAT_MEC Enable Memory Encryption Contexts Extension +// CHECK-NEXT: FEAT_MOPS Enable Armv8.8-A memcpy and memset acceleration instructions +// 
CHECK-NEXT: FEAT_MPAM Enable Armv8.4-A Memory system Partitioning and Monitoring extension +// CHECK-NEXT: FEAT_NMI, FEAT_GICv3_NMI Enable Armv8.8-A Non-maskable Interrupts +// CHECK-NEXT: FEAT_NV, FEAT_NV2 Enable Armv8.4-A Nested Virtualization Enchancement +// CHECK-NEXT: FEAT_PAN Enable Armv8.1-A Privileged Access-Never extension +// CHECK-NEXT: FEAT_PAN2 Enable Armv8.2-A PAN s1e1R and s1e1W Variants +// CHECK-NEXT: FEAT_PAuth Enable Armv8.3-A Pointer Authentication extension +// CHECK-NEXT: FEAT_PMUv3 Enable Armv8.0-A PMUv3 Performance Monitors extension +// CHECK-NEXT: FEAT_RAS, FEAT_RASv1p1 Enable Armv8.0-A Reliability, Availability and Serviceability Extensions +// CHECK-NEXT: FEAT_RDM Enable Armv8.1-A Rounding Double Multiply Add/Subtract instructions +// CHECK-NEXT: FEAT_RME Enable Realm Management Extension +// CHECK-NEXT: FEAT_RNG Enable Random Number generation instructions +// CHECK-NEXT: FEAT_SB Enable Armv8.5-A Speculation Barrier +// CHECK-NEXT: FEAT_SEL2 Enable Armv8.4-A Secure Exception Level 2 extension +// CHECK-NEXT: FEAT_SHA1, FEAT_SHA256 Enable SHA1 and SHA256 support +// CHECK-NEXT: FEAT_SHA3, FEAT_SHA512 Enable SHA512 and SHA3 support +// CHECK-NEXT: FEAT_SM4, FEAT_SM3 Enable SM3 and SM4 support +// CHECK-NEXT: FEAT_SPECRES Enable Armv8.5-A execution and data prediction invalidation instructions +// CHECK-NEXT: FEAT_SPECRES2 Enable Speculation Restriction Instruction +// CHECK-NEXT: FEAT_SPEv1p2 Enable extra register in the Statistical Profiling Extension +// CHECK-NEXT: FEAT_SSBS, FEAT_SSBS2 Enable Speculative Store Bypass Safe bit +// CHECK-NEXT: FEAT_SVE Enable Scalable Vector Extension (SVE) instructions +// CHECK-NEXT: FEAT_SVE2 Enable Scalable Vector Extension 2 (SVE2) instructions +// CHECK-NEXT: FEAT_SVE_AES, FEAT_SVE_PMULL128 Enable SVE AES and quadword SVE polynomial multiply instructions +// CHECK-NEXT: FEAT_SVE_BitPerm Enable bit permutation SVE2 instructions +// CHECK-NEXT: FEAT_SVE_SHA3 Enable SHA3 SVE2 instructions +// CHECK-NEXT: FEAT_SVE_SM4 Enable SM4 SVE2 instructions +// CHECK-NEXT: FEAT_TLBIOS, FEAT_TLBIRANGE Enable Armv8.4-A TLB Range and Maintenance instructions +// CHECK-NEXT: FEAT_TRBE Enable Trace Buffer Extension +// CHECK-NEXT: FEAT_TRF Enable Armv8.4-A Trace extension +// CHECK-NEXT: FEAT_UAO Enable Armv8.2-A UAO PState +// CHECK-NEXT: FEAT_VHE Enable Armv8.1-A Virtual Host extension +// CHECK-NEXT: FEAT_WFxT Enable Armv8.7-A WFET and WFIT instruction +// CHECK-NEXT: FEAT_XS Enable Armv8.7-A limited-TLB-maintenance instruction \ No newline at end of file diff --git a/clang/test/Misc/target-invalid-cpu-note/aarch64.c b/clang/test/Misc/target-invalid-cpu-note/aarch64.c index ab83f299ac5997..e6ff09557fe070 100644 --- a/clang/test/Misc/target-invalid-cpu-note/aarch64.c +++ b/clang/test/Misc/target-invalid-cpu-note/aarch64.c @@ -67,6 +67,7 @@ // CHECK-SAME: {{^}}, exynos-m4 // CHECK-SAME: {{^}}, exynos-m5 // CHECK-SAME: {{^}}, falkor +// CHECK-SAME: {{^}}, fujitsu-monaka // CHECK-SAME: {{^}}, generic // CHECK-SAME: {{^}}, grace // CHECK-SAME: {{^}}, kryo diff --git a/clang/test/Parser/cxx2c-pack-indexing.cpp b/clang/test/Parser/cxx2c-pack-indexing.cpp index c279bdd7af8c44..99347a2f8f1571 100644 --- a/clang/test/Parser/cxx2c-pack-indexing.cpp +++ b/clang/test/Parser/cxx2c-pack-indexing.cpp @@ -74,3 +74,12 @@ struct SS { } }; } + +namespace GH119072 { + +template <typename... Ts> +void foo() { + decltype(Ts...[0]::t) value; +} + +} diff --git a/clang/test/Sema/constant_builtins_vector.cpp b/clang/test/Sema/constant_builtins_vector.cpp index
45c729f76418d1..b2f56e5a87ab1a 100644 --- a/clang/test/Sema/constant_builtins_vector.cpp +++ b/clang/test/Sema/constant_builtins_vector.cpp @@ -822,3 +822,19 @@ static_assert(__builtin_elementwise_bitreverse(0x12345678) == 0x1E6A2C48); static_assert(__builtin_elementwise_bitreverse(0x0123456789ABCDEFULL) == 0xF7B3D591E6A2C480); static_assert(__builtin_bit_cast(unsigned, __builtin_elementwise_bitreverse((vector4char){1, 2, 4, 8})) == (LITTLE_END ? 0x10204080 : 0x80402010)); static_assert(__builtin_bit_cast(unsigned long long, __builtin_elementwise_bitreverse((vector4short){1, 2, 4, 8})) == (LITTLE_END ? 0x1000200040008000 : 0x8000400020001000)); + +static_assert(__builtin_elementwise_add_sat(1, 2) == 3); +static_assert(__builtin_elementwise_add_sat(1U, 2U) == 3U); +static_assert(__builtin_elementwise_add_sat(~(1 << 31), 42) == ~(1 << 31)); +static_assert(__builtin_elementwise_add_sat((1 << 31), -42) == (1 << 31)); +static_assert(__builtin_elementwise_add_sat(~0U, 1U) == ~0U); +static_assert(__builtin_bit_cast(unsigned, __builtin_elementwise_add_sat((vector4char){1, 2, 3, 4}, (vector4char){1, 2, 3, 4})) == (LITTLE_END ? 0x08060402 : 0x02040608)); +static_assert(__builtin_bit_cast(unsigned long long, __builtin_elementwise_add_sat((vector4short){(short)0x8000, (short)0x8001, (short)0x8002, (short)0x8003}, (vector4short){-7, -8, -9, -10})) == (LITTLE_END ? 0x8000800080008000 : 0x8000800080008000)); + +static_assert(__builtin_elementwise_sub_sat(1, 2) == -1); +static_assert(__builtin_elementwise_sub_sat(2U, 1U) == 1U); +static_assert(__builtin_elementwise_sub_sat(~(1 << 31), -42) == ~(1 << 31)); +static_assert(__builtin_elementwise_sub_sat((1 << 31), 42) == (1 << 31)); +static_assert(__builtin_elementwise_sub_sat(0U, 1U) == 0U); +static_assert(__builtin_bit_cast(unsigned, __builtin_elementwise_sub_sat((vector4char){5, 4, 3, 2}, (vector4char){1, 1, 1, 1})) == (LITTLE_END ? 0x01020304 : 0x04030201)); +static_assert(__builtin_bit_cast(unsigned long long, __builtin_elementwise_sub_sat((vector4short){(short)0x8000, (short)0x8001, (short)0x8002, (short)0x8003}, (vector4short){7, 8, 9, 10})) == (LITTLE_END ? 0x8000800080008000 : 0x8000800080008000)); diff --git a/compiler-rt/lib/orc/tests/unit/CMakeLists.txt b/compiler-rt/lib/orc/tests/unit/CMakeLists.txt index aec689a407be6e..16cd16c45683df 100644 --- a/compiler-rt/lib/orc/tests/unit/CMakeLists.txt +++ b/compiler-rt/lib/orc/tests/unit/CMakeLists.txt @@ -2,6 +2,7 @@ set(UNITTEST_SOURCES adt_test.cpp bitmask_enum_test.cpp c_api_test.cpp + common.cpp endian_test.cpp error_test.cpp executor_address_test.cpp diff --git a/compiler-rt/lib/orc/tests/unit/common.cpp b/compiler-rt/lib/orc/tests/unit/common.cpp new file mode 100644 index 00000000000000..9479cc80943085 --- /dev/null +++ b/compiler-rt/lib/orc/tests/unit/common.cpp @@ -0,0 +1,6 @@ +#include <stdio.h> + +/// Defined so that tests can use code that logs errors.
+extern "C" void __orc_rt_log_error(const char *ErrMsg) { + fprintf(stderr, "orc runtime error: %s\n", ErrMsg); +} diff --git a/libc/src/__support/FPUtil/aarch64/FEnvImpl.h b/libc/src/__support/FPUtil/aarch64/FEnvImpl.h index 3cea9772154fc6..18b0631324f8fb 100644 --- a/libc/src/__support/FPUtil/aarch64/FEnvImpl.h +++ b/libc/src/__support/FPUtil/aarch64/FEnvImpl.h @@ -26,7 +26,6 @@ namespace LIBC_NAMESPACE_DECL { namespace fputil { - struct FEnv { struct FPState { uint32_t ControlWord; @@ -42,11 +41,11 @@ struct FEnv { static constexpr uint32_t DOWNWARD = 0x2; static constexpr uint32_t TOWARDZERO = 0x3; - static constexpr uint32_t INVALID = 0x1; - static constexpr uint32_t DIVBYZERO = 0x2; - static constexpr uint32_t OVERFLOW = 0x4; - static constexpr uint32_t UNDERFLOW = 0x8; - static constexpr uint32_t INEXACT = 0x10; + static constexpr uint32_t INVALID_F = 0x1; + static constexpr uint32_t DIVBYZERO_F = 0x2; + static constexpr uint32_t OVERFLOW_F = 0x4; + static constexpr uint32_t UNDERFLOW_F = 0x8; + static constexpr uint32_t INEXACT_F = 0x10; // Zero-th bit is the first bit. static constexpr uint32_t RoundingControlBitPosition = 22; @@ -54,19 +53,19 @@ struct FEnv { static constexpr uint32_t ExceptionControlFlagsBitPosition = 8; LIBC_INLINE static uint32_t getStatusValueForExcept(int excepts) { - return ((excepts & FE_INVALID) ? INVALID : 0) | - ((excepts & FE_DIVBYZERO) ? DIVBYZERO : 0) | - ((excepts & FE_OVERFLOW) ? OVERFLOW : 0) | - ((excepts & FE_UNDERFLOW) ? UNDERFLOW : 0) | - ((excepts & FE_INEXACT) ? INEXACT : 0); + return ((excepts & FE_INVALID) ? INVALID_F : 0) | + ((excepts & FE_DIVBYZERO) ? DIVBYZERO_F : 0) | + ((excepts & FE_OVERFLOW) ? OVERFLOW_F : 0) | + ((excepts & FE_UNDERFLOW) ? UNDERFLOW_F : 0) | + ((excepts & FE_INEXACT) ? INEXACT_F : 0); } LIBC_INLINE static int exceptionStatusToMacro(uint32_t status) { - return ((status & INVALID) ? FE_INVALID : 0) | - ((status & DIVBYZERO) ? FE_DIVBYZERO : 0) | - ((status & OVERFLOW) ? FE_OVERFLOW : 0) | - ((status & UNDERFLOW) ? FE_UNDERFLOW : 0) | - ((status & INEXACT) ? FE_INEXACT : 0); + return ((status & INVALID_F) ? FE_INVALID : 0) | + ((status & DIVBYZERO_F) ? FE_DIVBYZERO : 0) | + ((status & OVERFLOW_F) ? FE_OVERFLOW : 0) | + ((status & UNDERFLOW_F) ? FE_UNDERFLOW : 0) | + ((status & INEXACT_F) ? 
FE_INEXACT : 0); } static uint32_t getControlWord() { @@ -171,36 +170,36 @@ LIBC_INLINE int raise_except(int excepts) { uint32_t toRaise = FEnv::getStatusValueForExcept(excepts); int result = 0; - if (toRaise & FEnv::INVALID) { + if (toRaise & FEnv::INVALID_F) { divfunc(zero, zero); uint32_t statusWord = FEnv::getStatusWord(); if (!((statusWord >> FEnv::ExceptionStatusFlagsBitPosition) & - FEnv::INVALID)) + FEnv::INVALID_F)) result = -1; } - if (toRaise & FEnv::DIVBYZERO) { + if (toRaise & FEnv::DIVBYZERO_F) { divfunc(one, zero); uint32_t statusWord = FEnv::getStatusWord(); if (!((statusWord >> FEnv::ExceptionStatusFlagsBitPosition) & - FEnv::DIVBYZERO)) + FEnv::DIVBYZERO_F)) result = -1; } - if (toRaise & FEnv::OVERFLOW) { + if (toRaise & FEnv::OVERFLOW_F) { divfunc(largeValue, smallValue); uint32_t statusWord = FEnv::getStatusWord(); if (!((statusWord >> FEnv::ExceptionStatusFlagsBitPosition) & - FEnv::OVERFLOW)) + FEnv::OVERFLOW_F)) result = -1; } - if (toRaise & FEnv::UNDERFLOW) { + if (toRaise & FEnv::UNDERFLOW_F) { divfunc(smallValue, largeValue); uint32_t statusWord = FEnv::getStatusWord(); if (!((statusWord >> FEnv::ExceptionStatusFlagsBitPosition) & - FEnv::UNDERFLOW)) + FEnv::UNDERFLOW_F)) result = -1; } - if (toRaise & FEnv::INEXACT) { + if (toRaise & FEnv::INEXACT_F) { float two = 2.0f; float three = 3.0f; // 2.0 / 3.0 cannot be represented exactly in any radix 2 floating point @@ -208,7 +207,7 @@ divfunc(two, three); uint32_t statusWord = FEnv::getStatusWord(); if (!((statusWord >> FEnv::ExceptionStatusFlagsBitPosition) & - FEnv::INEXACT)) + FEnv::INEXACT_F)) result = -1; } return result; @@ -278,7 +277,6 @@ LIBC_INLINE int set_env(const fenv_t *envp) { FEnv::writeStatusWord(state->StatusWord); return 0; } - } // namespace fputil } // namespace LIBC_NAMESPACE_DECL diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index c974f02b1049e3..348dffe2adfe99 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -3000,6 +3000,8 @@ template <class ELFT> void LinkerDriver::link(opt::InputArgList &args) { if (!ctx.arg.relocatable) { llvm::TimeTraceScope timeScope("Process symbol versions"); ctx.symtab->scanVersionScript(); + + parseVersionAndComputeIsPreemptible(ctx); } // Skip the normal linked output if some LTO options are specified. diff --git a/lld/ELF/ICF.cpp b/lld/ELF/ICF.cpp index 80ddc3c32cf5d1..1cdcf6be9d8a93 100644 --- a/lld/ELF/ICF.cpp +++ b/lld/ELF/ICF.cpp @@ -461,13 +461,6 @@ static void combineRelocHashes(unsigned cnt, InputSection *isec, // The main function of ICF. template <class ELFT> void ICF<ELFT>::run() { - // Compute isPreemptible early. We may add more symbols later, so this loop - // cannot be merged with the later computeIsPreemptible() pass which is used - // by scanRelocations(). - if (ctx.arg.hasDynSymTab) - for (Symbol *sym : ctx.symtab->getSymbols()) - sym->isPreemptible = computeIsPreemptible(ctx, *sym); - // Two text sections may have identical content and relocations but different // LSDA, e.g. the two functions may have catch blocks of different types.
If a // text section is referenced by a .eh_frame FDE with LSDA, it is not diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp index 9ff6ed58688cef..2084fcfd4d651a 100644 --- a/lld/ELF/InputFiles.cpp +++ b/lld/ELF/InputFiles.cpp @@ -528,7 +528,7 @@ template <class ELFT> void ELFFileBase::init(InputFile::Kind k) { Fatal(ctx) << this << ": invalid sh_info in symbol table"; elfSyms = reinterpret_cast<const void *>(eSyms.data()); - numELFSyms = uint32_t(eSyms.size()); + numSymbols = eSyms.size(); stringTable = CHECK2(obj.getStringTableForSymtab(*symtabSec, sections), this); } @@ -1089,10 +1089,8 @@ InputSectionBase *ObjFile<ELFT>::createInputSection(uint32_t idx, template <class ELFT> void ObjFile<ELFT>::initializeSymbols(const object::ELFFile<ELFT> &obj) { ArrayRef<Elf_Sym> eSyms = this->getELFSyms<ELFT>(); - if (numSymbols == 0) { - numSymbols = eSyms.size(); + if (!symbols) symbols = std::make_unique<Symbol *[]>(numSymbols); - } // Some entries have been filled by LazyObjFile. auto *symtab = ctx.symtab.get(); @@ -1431,6 +1429,7 @@ template <class ELFT> void SharedFile::parse() { const Elf_Shdr *versymSec = nullptr; const Elf_Shdr *verdefSec = nullptr; const Elf_Shdr *verneedSec = nullptr; + symbols = std::make_unique<Symbol *[]>(numSymbols); // Search for .dynsym, .dynamic, .symtab, .gnu.version and .gnu.version_d. for (const Elf_Shdr &sec : sections) { @@ -1453,7 +1452,7 @@ } } - if (versymSec && numELFSyms == 0) { + if (versymSec && numSymbols == 0) { ErrAlways(ctx) << "SHT_GNU_versym should be associated with symbol table"; return; } @@ -1496,7 +1495,7 @@ // Parse ".gnu.version" section which is a parallel array for the symbol // table. If a given file doesn't have a ".gnu.version" section, we use // VER_NDX_GLOBAL. - size_t size = numELFSyms - firstGlobal; + size_t size = numSymbols - firstGlobal; std::vector<uint16_t> versyms(size, VER_NDX_GLOBAL); if (versymSec) { ArrayRef<Elf_Versym> versym = diff --git a/lld/ELF/InputFiles.h b/lld/ELF/InputFiles.h index 79545c4bdeb5d0..0b186db1ba0d1d 100644 --- a/lld/ELF/InputFiles.h +++ b/lld/ELF/InputFiles.h @@ -53,7 +53,7 @@ class InputFile { protected: std::unique_ptr<Symbol *[]> symbols; - uint32_t numSymbols = 0; + size_t numSymbols = 0; SmallVector<InputSectionBase *, 0> sections; public: @@ -208,7 +208,7 @@ } template <typename ELFT> typename ELFT::SymRange getELFSyms() const { return typename ELFT::SymRange( - reinterpret_cast<const typename ELFT::Sym *>(elfSyms), numELFSyms); + reinterpret_cast<const typename ELFT::Sym *>(elfSyms), numSymbols); } template <typename ELFT> typename ELFT::SymRange getGlobalELFSyms() const { return getELFSyms<ELFT>().slice(firstGlobal); } @@ -225,7 +225,6 @@ const void *elfShdrs = nullptr; const void *elfSyms = nullptr; uint32_t numELFShdrs = 0; - uint32_t numELFSyms = 0; uint32_t firstGlobal = 0; // Below are ObjFile specific members. diff --git a/lld/ELF/SymbolTable.cpp b/lld/ELF/SymbolTable.cpp index 648da94989d7a4..d5b2200b502279 100644 --- a/lld/ELF/SymbolTable.cpp +++ b/lld/ELF/SymbolTable.cpp @@ -350,13 +350,6 @@ void SymbolTable::scanVersionScript() { assignAsterisk(pat, &v, true); } - // Symbol themselves might know their versions because symbols - // can contain versions in the form of <name>@<version>. - // Let them parse and update their names to exclude version suffix. - for (Symbol *sym : symVector) - if (sym->hasVersionSuffix) - sym->parseSymbolVersion(ctx); - // isPreemptible is false at this point. To correctly compute the binding of a // Defined (which is used by includeInDynsym(ctx)), we need to know if it is // VER_NDX_LOCAL or not.
Compute symbol versions before handling diff --git a/lld/ELF/Symbols.cpp b/lld/ELF/Symbols.cpp index dd530c59c3dc8e..b19381fe439c79 100644 --- a/lld/ELF/Symbols.cpp +++ b/lld/ELF/Symbols.cpp @@ -11,6 +11,7 @@ #include "InputFiles.h" #include "InputSection.h" #include "OutputSections.h" +#include "SymbolTable.h" #include "SyntheticSections.h" #include "Target.h" #include "Writer.h" @@ -345,7 +346,7 @@ bool elf::computeIsPreemptible(Ctx &ctx, const Symbol &sym) { // Only symbols with default visibility that appear in dynsym can be // preempted. Symbols with protected visibility cannot be preempted. - if (!sym.includeInDynsym(ctx) || sym.visibility() != STV_DEFAULT) + if (sym.visibility() != STV_DEFAULT) return false; // At this point copy relocations have not been created yet, so any @@ -370,6 +371,20 @@ return true; } +void elf::parseVersionAndComputeIsPreemptible(Ctx &ctx) { + // Symbols themselves might know their versions because symbols + // can contain versions in the form of <name>@<version>. + // Let them parse and update their names to exclude version suffix. + bool hasDynSymTab = ctx.arg.hasDynSymTab; + for (Symbol *sym : ctx.symtab->getSymbols()) { + if (sym->hasVersionSuffix) + sym->parseSymbolVersion(ctx); + if (hasDynSymTab) + sym->isPreemptible = + sym->includeInDynsym(ctx) && computeIsPreemptible(ctx, *sym); + } +} + // Merge symbol properties. // // When we have many symbols of the same name, we choose one of them, diff --git a/lld/ELF/Symbols.h b/lld/ELF/Symbols.h index 85a52e98f87a1a..1a53f3a1e15261 100644 --- a/lld/ELF/Symbols.h +++ b/lld/ELF/Symbols.h @@ -527,6 +527,7 @@ void reportDuplicate(Ctx &, const Symbol &sym, const InputFile *newFile, InputSectionBase *errSec, uint64_t errOffset); void maybeWarnUnorderableSymbol(Ctx &, const Symbol *sym); bool computeIsPreemptible(Ctx &, const Symbol &sym); +void parseVersionAndComputeIsPreemptible(Ctx &); } // namespace lld::elf diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index 8cd16704ba3ee3..ea9692a2f731b2 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -297,8 +297,11 @@ static void demoteSymbolsAndComputeIsPreemptible(Ctx &ctx) { } } - if (ctx.arg.hasDynSymTab) - sym->isPreemptible = computeIsPreemptible(ctx, *sym); + if (ctx.arg.hasDynSymTab) { + sym->exportDynamic = sym->includeInDynsym(ctx); + sym->isPreemptible = + sym->exportDynamic && computeIsPreemptible(ctx, *sym); + } } } @@ -1888,7 +1891,7 @@ template <class ELFT> void Writer<ELFT>::finalizeSections() { if (ctx.in.symTab) ctx.in.symTab->addSymbol(sym); - if (sym->includeInDynsym(ctx)) { + if (sym->exportDynamic) { ctx.partitions[sym->partition - 1].dynSymTab->addSymbol(sym); if (auto *file = dyn_cast<SharedFile>(sym->file)) if (file->isNeeded && !sym->isUndefined()) diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index af9554085cacde..2da67126a17537 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -266,6 +266,14 @@ def TuneA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX", FeatureStorePairSuppress, FeaturePredictableSelectIsExpensive]>; +def TuneMONAKA : SubtargetFeature<"fujitsu-monaka", "ARMProcFamily", "MONAKA", + "Fujitsu FUJITSU-MONAKA processors", [ + FeaturePredictableSelectIsExpensive, + FeatureEnableSelectOptimize, + FeaturePostRAScheduler, + FeatureArithmeticBccFusion, + ]>; + def TuneCarmel : SubtargetFeature<"carmel", "ARMProcFamily", "Carmel", "Nvidia Carmel processors">; @@ -843,6 +851,12
@@ def ProcessorFeatures { FeatureSHA2, FeaturePerfMon, FeatureFullFP16, FeatureSVE, FeatureComplxNum, FeatureAES, FeatureCRC, FeatureLSE, FeatureRAS, FeatureRDM]; + list<SubtargetFeature> MONAKA = [HasV9_3aOps, FeaturePerfMon, FeatureCCIDX, + FeatureFPAC, FeatureFP16FML, FeatureRandGen, + FeatureSSBS, FeatureLS64, FeatureCLRBHB, + FeatureSPECRES2, FeatureSVEAES, FeatureSVE2SM4, + FeatureSVE2SHA3, FeatureSVE2BitPerm, FeatureETE, + FeatureMEC, FeatureFP8DOT2]; list<SubtargetFeature> Carmel = [HasV8_2aOps, FeatureNEON, FeatureSHA2, FeatureAES, FeatureFullFP16, FeatureCRC, FeatureLSE, FeatureRAS, FeatureRDM, FeatureFPARMv8]; @@ -1225,6 +1239,10 @@ def : ProcessorAlias<"apple-latest", "apple-m4">; def : ProcessorModel<"a64fx", A64FXModel, ProcessorFeatures.A64FX, [TuneA64FX]>; +// Fujitsu FUJITSU-MONAKA +def : ProcessorModel<"fujitsu-monaka", A64FXModel, ProcessorFeatures.MONAKA, + [TuneMONAKA]>; + // Nvidia Carmel def : ProcessorModel<"carmel", NoSchedModel, ProcessorFeatures.Carmel, [TuneCarmel]>; diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index e37e2cacc7852e..3767b34bd5b0c5 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -195,6 +195,9 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) { MaxPrefetchIterationsAhead = 4; VScaleForTuning = 4; break; + case MONAKA: + VScaleForTuning = 2; + break; case AppleA7: case AppleA10: case AppleA11: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 6618650b346d65..e3b2e492e3d4fe 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -429,10 +429,10 @@ static cl::opt cl::desc("Enable loop data prefetch on AMDGPU"), cl::Hidden, cl::init(false)); -static cl::opt<bool> EnableMaxIlpSchedStrategy( - "amdgpu-enable-max-ilp-scheduling-strategy", - cl::desc("Enable scheduling strategy to maximize ILP for a single wave."), - cl::Hidden, cl::init(false)); +static cl::opt<std::string> + AMDGPUSchedStrategy("amdgpu-sched-strategy", + cl::desc("Select custom AMDGPU scheduling strategy."), + cl::Hidden, cl::init("")); static cl::opt<bool> EnableRewritePartialRegUses( "amdgpu-enable-rewrite-partial-reg-uses", @@ -569,6 +569,18 @@ createGCNMaxILPMachineScheduler(MachineSchedContext *C) { return DAG; } +static ScheduleDAGInstrs * +createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) { + const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>(); + ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive( + C, std::make_unique<GCNMaxMemoryClauseSchedStrategy>(C)); + DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); + if (ST.shouldClusterStores()) + DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); + DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); + return DAG; +} + static ScheduleDAGInstrs * createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>(); @@ -609,6 +621,10 @@ static MachineSchedRegistry GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp", createGCNMaxILPMachineScheduler); +static MachineSchedRegistry GCNMaxMemoryClauseSchedRegistry( + "gcn-max-memory-clause", "Run GCN scheduler to maximize memory clause", + createGCNMaxMemoryClauseMachineScheduler); + static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry( "gcn-iterative-max-occupancy-experimental", "Run GCN scheduler to maximize occupancy (experimental)", @@ -1330,9 +1346,18 @@ ScheduleDAGInstrs
*GCNPassConfig::createMachineScheduler( if (ST.enableSIScheduler()) return createSIMachineScheduler(C); - if (EnableMaxIlpSchedStrategy) + Attribute SchedStrategyAttr = + C->MF->getFunction().getFnAttribute("amdgpu-sched-strategy"); + StringRef SchedStrategy = SchedStrategyAttr.isValid() + ? SchedStrategyAttr.getValueAsString() + : AMDGPUSchedStrategy; + + if (SchedStrategy == "max-ilp") return createGCNMaxILPMachineScheduler(C); + if (SchedStrategy == "max-memory-clause") + return createGCNMaxMemoryClauseMachineScheduler(C); + return createGCNMaxOccupancyMachineScheduler(C); } diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 57f517bfba0ebb..1c23b237eaf4be 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -615,6 +615,138 @@ bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand, return false; } +GCNMaxMemoryClauseSchedStrategy::GCNMaxMemoryClauseSchedStrategy( + const MachineSchedContext *C) + : GCNSchedStrategy(C) { + SchedStages.push_back(GCNSchedStageID::MemoryClauseInitialSchedule); +} + +/// GCNMaxMemoryClauseSchedStrategy tries its best to clause memory instructions +/// as much as possible. This is achieved by: +/// 1. Prioritize clustered operations before stall latency heuristic. +/// 2. Prioritize long-latency-load before stall latency heuristic. +/// +/// \param Cand provides the policy and current best candidate. +/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized. +/// \param Zone describes the scheduled zone that we are extending, or nullptr +/// if Cand is from a different zone than TryCand. +/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand) +bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const { + // Initialize the candidate if needed. + if (!Cand.isValid()) { + TryCand.Reason = NodeOrder; + return true; + } + + // Bias PhysReg Defs and copies to their uses and defined respectively. + if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop), + biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg)) + return TryCand.Reason != NoCand; + + if (DAG->isTrackingPressure()) { + // Avoid exceeding the target's limit. + if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand, + RegExcess, TRI, DAG->MF)) + return TryCand.Reason != NoCand; + + // Avoid increasing the max critical pressure in the scheduled region. + if (tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax, + TryCand, Cand, RegCritical, TRI, DAG->MF)) + return TryCand.Reason != NoCand; + } + + // MaxMemoryClause-specific: We prioritize clustered instructions as we would + // get more benefit from clausing these memory instructions. + const SUnit *CandNextClusterSU = + Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); + const SUnit *TryCandNextClusterSU = + TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); + if (tryGreater(TryCand.SU == TryCandNextClusterSU, + Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster)) + return TryCand.Reason != NoCand; + + // We only compare a subset of features when comparing nodes between + // Top and Bottom boundary. Some properties are simply incomparable, in many + // other instances we should only override the other boundary if something + // is a clear good pick on one boundary. Skip heuristics that are more + // "tie-breaking" in nature.
+ bool SameBoundary = Zone != nullptr; + if (SameBoundary) { + // For loops that are acyclic path limited, aggressively schedule for + // latency. Within a single cycle, whenever CurrMOps > 0, allow normal + // heuristics to take precedence. + if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() && + tryLatency(TryCand, Cand, *Zone)) + return TryCand.Reason != NoCand; + + // MaxMemoryClause-specific: Prioritize long latency memory load + // instructions in top-bottom order to hide more latency. The mayLoad check + // is used to exclude store-like instructions, which we do not want to + // schedule too early. + bool TryMayLoad = + TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad(); + bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad(); + + if (TryMayLoad || CandMayLoad) { + bool TryLongLatency = + TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad; + bool CandLongLatency = + 10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad; + + if (tryGreater(Zone->isTop() ? TryLongLatency : CandLongLatency, + Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand, + Cand, Stall)) + return TryCand.Reason != NoCand; + } + // Prioritize instructions that read unbuffered resources by stall cycles. + if (tryLess(Zone->getLatencyStallCycles(TryCand.SU), + Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall)) + return TryCand.Reason != NoCand; + } + + if (SameBoundary) { + // Weak edges are for clustering and other constraints. + if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop), + getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak)) + return TryCand.Reason != NoCand; + } + + // Avoid increasing the max pressure of the entire region. + if (DAG->isTrackingPressure() && + tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand, + Cand, RegMax, TRI, DAG->MF)) + return TryCand.Reason != NoCand; + + if (SameBoundary) { + // Avoid critical resource consumption and balance the schedule. + TryCand.initResourceDelta(DAG, SchedModel); + if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, + TryCand, Cand, ResourceReduce)) + return TryCand.Reason != NoCand; + if (tryGreater(TryCand.ResDelta.DemandedResources, + Cand.ResDelta.DemandedResources, TryCand, Cand, + ResourceDemand)) + return TryCand.Reason != NoCand; + + // Avoid serializing long latency dependence chains. + // For acyclic path limited loops, latency was already checked above. + if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency && + !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone)) + return TryCand.Reason != NoCand; + + // Fall through to original instruction order.
+ if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) { + assert(TryCand.SU->NodeNum != Cand.SU->NodeNum); + TryCand.Reason = NodeOrder; + return true; + } + } + + return false; +} + GCNScheduleDAGMILive::GCNScheduleDAGMILive( MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S) : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()), @@ -644,6 +776,9 @@ GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) { return std::make_unique<PreRARematStage>(SchedStageID, *this); case GCNSchedStageID::ILPInitialSchedule: return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this); + case GCNSchedStageID::MemoryClauseInitialSchedule: + return std::make_unique<MemoryClauseInitialScheduleStage>(SchedStageID, + *this); } llvm_unreachable("Unknown SchedStageID."); @@ -869,6 +1004,9 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) { case GCNSchedStageID::ILPInitialSchedule: OS << "Max ILP Initial Schedule"; break; + case GCNSchedStageID::MemoryClauseInitialSchedule: + OS << "Max memory clause Initial Schedule"; + break; } return OS; @@ -1088,7 +1226,8 @@ void GCNSchedStage::setupNewBlock() { // Get real RP for the region if it hasn't be calculated before. After the // initial schedule stage real RP will be collected after scheduling. if (StageID == GCNSchedStageID::OccInitialSchedule || - StageID == GCNSchedStageID::ILPInitialSchedule) + StageID == GCNSchedStageID::ILPInitialSchedule || + StageID == GCNSchedStageID::MemoryClauseInitialSchedule) DAG.computeBlockPressure(RegionIdx, CurrentMBB); } @@ -1389,6 +1528,11 @@ bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { return false; } +bool MemoryClauseInitialScheduleStage::shouldRevertScheduling( + unsigned WavesAfter) { + return mayCauseSpilling(WavesAfter); +} + bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) { if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() && !PressureAfter.less(MF, PressureBefore)) { diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 64d517038f90e0..44db834a41f828 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -29,7 +29,8 @@ enum class GCNSchedStageID : unsigned { UnclusteredHighRPReschedule = 1, ClusteredLowOccupancyReschedule = 2, PreRARematerialize = 3, - ILPInitialSchedule = 4 + ILPInitialSchedule = 4, + MemoryClauseInitialSchedule = 5 }; #ifndef NDEBUG @@ -149,6 +150,17 @@ class GCNMaxILPSchedStrategy final : public GCNSchedStrategy { GCNMaxILPSchedStrategy(const MachineSchedContext *C); }; +/// The goal of this scheduling strategy is to maximize memory clauses for a +/// single wave.
+class GCNMaxMemoryClauseSchedStrategy final : public GCNSchedStrategy { +protected: + bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) const override; + +public: + GCNMaxMemoryClauseSchedStrategy(const MachineSchedContext *C); +}; + class ScheduleMetrics { unsigned ScheduleLength; unsigned BubbleCycles; @@ -463,6 +475,15 @@ class ILPInitialScheduleStage : public GCNSchedStage { : GCNSchedStage(StageID, DAG) {} }; +class MemoryClauseInitialScheduleStage : public GCNSchedStage { +public: + bool shouldRevertScheduling(unsigned WavesAfter) override; + + MemoryClauseInitialScheduleStage(GCNSchedStageID StageID, + GCNScheduleDAGMILive &DAG) + : GCNSchedStage(StageID, DAG) {} +}; + class GCNPostScheduleDAGMILive final : public ScheduleDAGMI { private: std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations; diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 56ce5e99914e16..03ef64cef3134e 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -280,8 +280,9 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) { if (Implementer == "0x46") { // Fujitsu Ltd. return StringSwitch<const char *>(Part) - .Case("0x001", "a64fx") - .Default("generic"); + .Case("0x001", "a64fx") + .Case("0x003", "fujitsu-monaka") + .Default("generic"); } if (Implementer == "0x4e") { // NVIDIA Corporation diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 70a6331e34f45b..b65e1cb97d3bb7 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -934,6 +934,11 @@ Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) { } } + if (DestWidth == 1 && + (Trunc.hasNoUnsignedWrap() || Trunc.hasNoSignedWrap()) && + isKnownNonZero(Src, SQ.getWithInstruction(&Trunc))) + return replaceInstUsesWith(Trunc, ConstantInt::getTrue(DestTy)); + bool Changed = false; if (!Trunc.hasNoSignedWrap() && ComputeMaxSignificantBits(Src, /*Depth=*/0, &Trunc) <= DestWidth) { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 4a4553e4a8db8d..37118702762956 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7740,7 +7740,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( BestVPlan.prepareToExecute(ILV.getTripCount(), ILV.getOrCreateVectorTripCount(nullptr), CanonicalIVStartValue, State); - VPlanTransforms::prepareToExecute(BestVPlan); + VPlanTransforms::convertToConcreteRecipes(BestVPlan); BestVPlan.execute(&State); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index b438700a3fe2ce..5903ad29af7602 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3486,7 +3486,7 @@ void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, void VPScalarPHIRecipe::execute(VPTransformState &State) { BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); - Value *Start = State.get(getOperand(0), VPLane(0)); + Value *Start = State.get(getStartValue(), VPLane(0)); PHINode *Phi = State.Builder.CreatePHI(Start->getType(), 2, Name); Phi->addIncoming(Start, VectorPH); Phi->setDebugLoc(getDebugLoc()); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index cee83d1015b536..6d77173735c9b8 100--- 
a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1820,9 +1820,7 @@ void VPlanTransforms::createInterleaveGroups( } } -void VPlanTransforms::prepareToExecute(VPlan &Plan) { - ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT( - Plan.getVectorLoopRegion()); +void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) { for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( vp_depth_first_deep(Plan.getEntry()))) { for (VPRecipeBase &R : make_early_inc_range(VPBB->phis())) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 1491e0a8df04d5..9cf314a6a9f447 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -125,7 +125,7 @@ struct VPlanTransforms { static void removeDeadRecipes(VPlan &Plan); /// Lower abstract recipes to concrete ones, that can be codegen'd. - static void prepareToExecute(VPlan &Plan); + static void convertToConcreteRecipes(VPlan &Plan); }; } // namespace llvm diff --git a/llvm/test/CodeGen/AArch64/arm64-dup.ll b/llvm/test/CodeGen/AArch64/arm64-dup.ll index a25763e3b15907..4c28ea75922024 100644 --- a/llvm/test/CodeGen/AArch64/arm64-dup.ll +++ b/llvm/test/CodeGen/AArch64/arm64-dup.ll @@ -733,5 +733,5 @@ define <4 x i16> @dup_i16_v4i16_constant() { ; CHECK-GI-NEXT: adrp x8, .LCPI50_0 ; CHECK-GI-NEXT: ldr d0, [x8, :lo12:.LCPI50_0] ; CHECK-GI-NEXT: ret - ret <4 x i16> + ret <4 x i16> } diff --git a/llvm/test/CodeGen/AArch64/cpus.ll b/llvm/test/CodeGen/AArch64/cpus.ll index 3e593a82fdf288..e9722f348f4113 100644 --- a/llvm/test/CodeGen/AArch64/cpus.ll +++ b/llvm/test/CodeGen/AArch64/cpus.ll @@ -36,6 +36,7 @@ ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=tsv110 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=apple-latest 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=a64fx 2>&1 | FileCheck %s +; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=fujitsu-monaka 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=ampere1 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=ampere1a 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=ampere1b 2>&1 | FileCheck %s diff --git a/llvm/test/CodeGen/AArch64/dup.ll b/llvm/test/CodeGen/AArch64/dup.ll new file mode 100644 index 00000000000000..a2ebdd28b16b8f --- /dev/null +++ b/llvm/test/CodeGen/AArch64/dup.ll @@ -0,0 +1,1698 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-none-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64-none-none-eabi -verify-machineinstrs -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for dup_v2i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for duplane0_v2i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for loaddup_v2i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for dup_v2i128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for duplane0_v2i128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for loaddup_v2i128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for dup_v3i128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for
duplane0_v3i128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for loaddup_v3i128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for dup_v4i128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for duplane0_v4i128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for loaddup_v4i128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for dup_v2fp128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for duplane0_v2fp128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for loaddup_v2fp128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for dup_v3fp128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for duplane0_v3fp128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for loaddup_v3fp128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for dup_v4fp128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for duplane0_v4fp128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for loaddup_v4fp128 + +define <2 x i8> @dup_v2i8(i8 %a) { +; CHECK-LABEL: dup_v2i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.2s, w0 +; CHECK-NEXT: ret +entry: + %b = insertelement <2 x i8> poison, i8 %a, i64 0 + %c = shufflevector <2 x i8> %b, <2 x i8> poison, <2 x i32> zeroinitializer + ret <2 x i8> %c +} + +define <2 x i8> @duplane0_v2i8(<2 x i8> %b) { +; CHECK-LABEL: duplane0_v2i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.2s, v0.s[0] +; CHECK-NEXT: ret +entry: + %c = shufflevector <2 x i8> %b, <2 x i8> poison, <2 x i32> zeroinitializer + ret <2 x i8> %c +} + +define <2 x i8> @loaddup_v2i8(ptr %p) { +; CHECK-LABEL: loaddup_v2i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldrb w8, [x0] +; CHECK-NEXT: dup v0.2s, w8 +; CHECK-NEXT: ret +entry: + %a = load i8, ptr %p + %b = insertelement <2 x i8> poison, i8 %a, i64 0 + %c = shufflevector <2 x i8> %b, <2 x i8> poison, <2 x i32> zeroinitializer + ret <2 x i8> %c +} + +define <3 x i8> @dup_v3i8(i8 %a) { +; CHECK-SD-LABEL: dup_v3i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w1, w0 +; CHECK-SD-NEXT: mov w2, w0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: dup_v3i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: dup v0.8b, w0 +; CHECK-GI-NEXT: umov w0, v0.b[0] +; CHECK-GI-NEXT: umov w1, v0.b[1] +; CHECK-GI-NEXT: umov w2, v0.b[2] +; CHECK-GI-NEXT: ret +entry: + %b = insertelement <3 x i8> poison, i8 %a, i64 0 + %c = shufflevector <3 x i8> %b, <3 x i8> poison, <3 x i32> zeroinitializer + ret <3 x i8> %c +} + +define <3 x i8> @duplane0_v3i8(<3 x i8> %b) { +; CHECK-SD-LABEL: duplane0_v3i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w1, w0 +; CHECK-SD-NEXT: mov w2, w0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: duplane0_v3i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: mov v0.b[1], w1 +; CHECK-GI-NEXT: mov v0.b[2], w2 +; CHECK-GI-NEXT: dup v0.8b, v0.b[0] +; CHECK-GI-NEXT: umov w0, v0.b[0] +; CHECK-GI-NEXT: umov w1, v0.b[1] +; CHECK-GI-NEXT: umov w2, v0.b[2] +; CHECK-GI-NEXT: ret +entry: + %c = shufflevector <3 x i8> %b, <3 x i8> poison, <3 x i32> zeroinitializer + ret <3 x i8> %c +} + +define <3 x i8> @loaddup_v3i8(ptr %p) { +; CHECK-SD-LABEL: loaddup_v3i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldrb w0, [x0] +; CHECK-SD-NEXT: mov w1, w0 +; CHECK-SD-NEXT: mov w2, w0 +; CHECK-SD-NEXT: ret +; +; 
CHECK-GI-LABEL: loaddup_v3i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ld1r { v0.8b }, [x0] +; CHECK-GI-NEXT: umov w0, v0.b[0] +; CHECK-GI-NEXT: umov w1, v0.b[1] +; CHECK-GI-NEXT: umov w2, v0.b[2] +; CHECK-GI-NEXT: ret +entry: + %a = load i8, ptr %p + %b = insertelement <3 x i8> poison, i8 %a, i64 0 + %c = shufflevector <3 x i8> %b, <3 x i8> poison, <3 x i32> zeroinitializer + ret <3 x i8> %c +} + +define <4 x i8> @dup_v4i8(i8 %a) { +; CHECK-SD-LABEL: dup_v4i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: dup v0.4h, w0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: dup_v4i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: dup v0.8b, w0 +; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret +entry: + %b = insertelement <4 x i8> poison, i8 %a, i64 0 + %c = shufflevector <4 x i8> %b, <4 x i8> poison, <4 x i32> zeroinitializer + ret <4 x i8> %c +} + +define <4 x i8> @duplane0_v4i8(<4 x i8> %b) { +; CHECK-SD-LABEL: duplane0_v4i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: dup v0.4h, v0.h[0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: duplane0_v4i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b +; CHECK-GI-NEXT: dup v0.8b, v0.b[0] +; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret +entry: + %c = shufflevector <4 x i8> %b, <4 x i8> poison, <4 x i32> zeroinitializer + ret <4 x i8> %c +} + +define <4 x i8> @loaddup_v4i8(ptr %p) { +; CHECK-SD-LABEL: loaddup_v4i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldrb w8, [x0] +; CHECK-SD-NEXT: dup v0.4h, w8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: loaddup_v4i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ld1r { v0.8b }, [x0] +; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret +entry: + %a = load i8, ptr %p + %b = insertelement <4 x i8> poison, i8 %a, i64 0 + %c = shufflevector <4 x i8> %b, <4 x i8> poison, <4 x i32> zeroinitializer + ret <4 x i8> %c +} + +define <8 x i8> @dup_v8i8(i8 %a) { +; CHECK-LABEL: dup_v8i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.8b, w0 +; CHECK-NEXT: ret +entry: + %b = insertelement <8 x i8> poison, i8 %a, i64 0 + %c = shufflevector <8 x i8> %b, <8 x i8> poison, <8 x i32> zeroinitializer + ret <8 x i8> %c +} + +define <8 x i8> @duplane0_v8i8(<8 x i8> %b) { +; CHECK-LABEL: duplane0_v8i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.8b, v0.b[0] +; CHECK-NEXT: ret +entry: + %c = shufflevector <8 x i8> %b, <8 x i8> poison, <8 x i32> zeroinitializer + ret <8 x i8> %c +} + +define <8 x i8> @loaddup_v8i8(ptr %p) { +; CHECK-LABEL: loaddup_v8i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ld1r { v0.8b }, [x0] +; CHECK-NEXT: ret +entry: + %a = load i8, ptr %p + %b = insertelement <8 x i8> poison, i8 %a, i64 0 + %c = shufflevector <8 x i8> %b, <8 x i8> poison, <8 x i32> zeroinitializer + ret <8 x i8> %c +} + +define <16 x i8> @dup_v16i8(i8 %a) { +; CHECK-LABEL: dup_v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.16b, w0 +; CHECK-NEXT: ret +entry: + %b = insertelement <16 x i8> poison, i8 %a, i64 0 + %c = shufflevector <16 x i8> %b, <16 x i8> poison, <16 x i32> zeroinitializer + ret <16 x i8> %c +} + +define <16 x i8> @duplane0_v16i8(<16 x i8> %b) { +; CHECK-LABEL: duplane0_v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.16b, 
v0.b[0] +; CHECK-NEXT: ret +entry: + %c = shufflevector <16 x i8> %b, <16 x i8> poison, <16 x i32> zeroinitializer + ret <16 x i8> %c +} + +define <16 x i8> @loaddup_v16i8(ptr %p) { +; CHECK-LABEL: loaddup_v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ld1r { v0.16b }, [x0] +; CHECK-NEXT: ret +entry: + %a = load i8, ptr %p + %b = insertelement <16 x i8> poison, i8 %a, i64 0 + %c = shufflevector <16 x i8> %b, <16 x i8> poison, <16 x i32> zeroinitializer + ret <16 x i8> %c +} + +define <32 x i8> @dup_v32i8(i8 %a) { +; CHECK-LABEL: dup_v32i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.16b, w0 +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: ret +entry: + %b = insertelement <32 x i8> poison, i8 %a, i64 0 + %c = shufflevector <32 x i8> %b, <32 x i8> poison, <32 x i32> zeroinitializer + ret <32 x i8> %c +} + +define <32 x i8> @duplane0_v32i8(<32 x i8> %b) { +; CHECK-LABEL: duplane0_v32i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.16b, v0.b[0] +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: ret +entry: + %c = shufflevector <32 x i8> %b, <32 x i8> poison, <32 x i32> zeroinitializer + ret <32 x i8> %c +} + +define <32 x i8> @loaddup_v32i8(ptr %p) { +; CHECK-SD-LABEL: loaddup_v32i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ld1r { v0.16b }, [x0] +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: loaddup_v32i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ld1r { v0.16b }, [x0] +; CHECK-GI-NEXT: ld1r { v1.16b }, [x0] +; CHECK-GI-NEXT: ret +entry: + %a = load i8, ptr %p + %b = insertelement <32 x i8> poison, i8 %a, i64 0 + %c = shufflevector <32 x i8> %b, <32 x i8> poison, <32 x i32> zeroinitializer + ret <32 x i8> %c +} + +define <2 x i16> @dup_v2i16(i16 %a) { +; CHECK-SD-LABEL: dup_v2i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: dup v0.2s, w0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: dup_v2i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: dup v0.4h, w0 +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret +entry: + %b = insertelement <2 x i16> poison, i16 %a, i64 0 + %c = shufflevector <2 x i16> %b, <2 x i16> poison, <2 x i32> zeroinitializer + ret <2 x i16> %c +} + +define <2 x i16> @duplane0_v2i16(<2 x i16> %b) { +; CHECK-SD-LABEL: duplane0_v2i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: dup v0.2s, v0.s[0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: duplane0_v2i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: uzp1 v0.4h, v0.4h, v0.4h +; CHECK-GI-NEXT: dup v0.4h, v0.h[0] +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret +entry: + %c = shufflevector <2 x i16> %b, <2 x i16> poison, <2 x i32> zeroinitializer + ret <2 x i16> %c +} + +define <2 x i16> @loaddup_v2i16(ptr %p) { +; CHECK-SD-LABEL: loaddup_v2i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldrh w8, [x0] +; CHECK-SD-NEXT: dup v0.2s, w8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: loaddup_v2i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ld1r { v0.4h }, [x0] +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret +entry: + %a = load i16, ptr %p + %b = insertelement <2 x i16> poison, i16 %a, i64 0 + %c = shufflevector <2 x i16> %b, <2 x i16> poison, <2 x i32> zeroinitializer + ret <2 x i16> %c +} + +define <3 x i16> @dup_v3i16(i16 %a) { +; CHECK-LABEL: dup_v3i16: +; CHECK: // 
%bb.0: // %entry +; CHECK-NEXT: dup v0.4h, w0 +; CHECK-NEXT: ret +entry: + %b = insertelement <3 x i16> poison, i16 %a, i64 0 + %c = shufflevector <3 x i16> %b, <3 x i16> poison, <3 x i32> zeroinitializer + ret <3 x i16> %c +} + +define <3 x i16> @duplane0_v3i16(<3 x i16> %b) { +; CHECK-LABEL: duplane0_v3i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.4h, v0.h[0] +; CHECK-NEXT: ret +entry: + %c = shufflevector <3 x i16> %b, <3 x i16> poison, <3 x i32> zeroinitializer + ret <3 x i16> %c +} + +define <3 x i16> @loaddup_v3i16(ptr %p) { +; CHECK-LABEL: loaddup_v3i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ld1r { v0.4h }, [x0] +; CHECK-NEXT: ret +entry: + %a = load i16, ptr %p + %b = insertelement <3 x i16> poison, i16 %a, i64 0 + %c = shufflevector <3 x i16> %b, <3 x i16> poison, <3 x i32> zeroinitializer + ret <3 x i16> %c +} + +define <4 x i16> @dup_v4i16(i16 %a) { +; CHECK-LABEL: dup_v4i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.4h, w0 +; CHECK-NEXT: ret +entry: + %b = insertelement <4 x i16> poison, i16 %a, i64 0 + %c = shufflevector <4 x i16> %b, <4 x i16> poison, <4 x i32> zeroinitializer + ret <4 x i16> %c +} + +define <4 x i16> @duplane0_v4i16(<4 x i16> %b) { +; CHECK-LABEL: duplane0_v4i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.4h, v0.h[0] +; CHECK-NEXT: ret +entry: + %c = shufflevector <4 x i16> %b, <4 x i16> poison, <4 x i32> zeroinitializer + ret <4 x i16> %c +} + +define <4 x i16> @loaddup_v4i16(ptr %p) { +; CHECK-LABEL: loaddup_v4i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ld1r { v0.4h }, [x0] +; CHECK-NEXT: ret +entry: + %a = load i16, ptr %p + %b = insertelement <4 x i16> poison, i16 %a, i64 0 + %c = shufflevector <4 x i16> %b, <4 x i16> poison, <4 x i32> zeroinitializer + ret <4 x i16> %c +} + +define <8 x i16> @dup_v8i16(i16 %a) { +; CHECK-LABEL: dup_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.8h, w0 +; CHECK-NEXT: ret +entry: + %b = insertelement <8 x i16> poison, i16 %a, i64 0 + %c = shufflevector <8 x i16> %b, <8 x i16> poison, <8 x i32> zeroinitializer + ret <8 x i16> %c +} + +define <8 x i16> @duplane0_v8i16(<8 x i16> %b) { +; CHECK-LABEL: duplane0_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.8h, v0.h[0] +; CHECK-NEXT: ret +entry: + %c = shufflevector <8 x i16> %b, <8 x i16> poison, <8 x i32> zeroinitializer + ret <8 x i16> %c +} + +define <8 x i16> @loaddup_v8i16(ptr %p) { +; CHECK-LABEL: loaddup_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ld1r { v0.8h }, [x0] +; CHECK-NEXT: ret +entry: + %a = load i16, ptr %p + %b = insertelement <8 x i16> poison, i16 %a, i64 0 + %c = shufflevector <8 x i16> %b, <8 x i16> poison, <8 x i32> zeroinitializer + ret <8 x i16> %c +} + +define <16 x i16> @dup_v16i16(i16 %a) { +; CHECK-LABEL: dup_v16i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.8h, w0 +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: ret +entry: + %b = insertelement <16 x i16> poison, i16 %a, i64 0 + %c = shufflevector <16 x i16> %b, <16 x i16> poison, <16 x i32> zeroinitializer + ret <16 x i16> %c +} + +define <16 x i16> @duplane0_v16i16(<16 x i16> %b) { +; CHECK-LABEL: duplane0_v16i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.8h, v0.h[0] +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: ret +entry: + %c = shufflevector <16 x i16> %b, <16 x i16> poison, <16 x i32> zeroinitializer + ret <16 x i16> %c +} + +define <16 x i16> @loaddup_v16i16(ptr %p) { +; CHECK-SD-LABEL: 
loaddup_v16i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ld1r { v0.8h }, [x0] +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: loaddup_v16i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ld1r { v0.8h }, [x0] +; CHECK-GI-NEXT: ld1r { v1.8h }, [x0] +; CHECK-GI-NEXT: ret +entry: + %a = load i16, ptr %p + %b = insertelement <16 x i16> poison, i16 %a, i64 0 + %c = shufflevector <16 x i16> %b, <16 x i16> poison, <16 x i32> zeroinitializer + ret <16 x i16> %c +} + +define <2 x i32> @dup_v2i32(i32 %a) { +; CHECK-LABEL: dup_v2i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.2s, w0 +; CHECK-NEXT: ret +entry: + %b = insertelement <2 x i32> poison, i32 %a, i64 0 + %c = shufflevector <2 x i32> %b, <2 x i32> poison, <2 x i32> zeroinitializer + ret <2 x i32> %c +} + +define <2 x i32> @duplane0_v2i32(<2 x i32> %b) { +; CHECK-LABEL: duplane0_v2i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.2s, v0.s[0] +; CHECK-NEXT: ret +entry: + %c = shufflevector <2 x i32> %b, <2 x i32> poison, <2 x i32> zeroinitializer + ret <2 x i32> %c +} + +define <2 x i32> @loaddup_v2i32(ptr %p) { +; CHECK-LABEL: loaddup_v2i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ld1r { v0.2s }, [x0] +; CHECK-NEXT: ret +entry: + %a = load i32, ptr %p + %b = insertelement <2 x i32> poison, i32 %a, i64 0 + %c = shufflevector <2 x i32> %b, <2 x i32> poison, <2 x i32> zeroinitializer + ret <2 x i32> %c +} + +define <3 x i32> @dup_v3i32(i32 %a) { +; CHECK-LABEL: dup_v3i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.4s, w0 +; CHECK-NEXT: ret +entry: + %b = insertelement <3 x i32> poison, i32 %a, i64 0 + %c = shufflevector <3 x i32> %b, <3 x i32> poison, <3 x i32> zeroinitializer + ret <3 x i32> %c +} + +define <3 x i32> @duplane0_v3i32(<3 x i32> %b) { +; CHECK-LABEL: duplane0_v3i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.4s, v0.s[0] +; CHECK-NEXT: ret +entry: + %c = shufflevector <3 x i32> %b, <3 x i32> poison, <3 x i32> zeroinitializer + ret <3 x i32> %c +} + +define <3 x i32> @loaddup_v3i32(ptr %p) { +; CHECK-LABEL: loaddup_v3i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ld1r { v0.4s }, [x0] +; CHECK-NEXT: ret +entry: + %a = load i32, ptr %p + %b = insertelement <3 x i32> poison, i32 %a, i64 0 + %c = shufflevector <3 x i32> %b, <3 x i32> poison, <3 x i32> zeroinitializer + ret <3 x i32> %c +} + +define <4 x i32> @dup_v4i32(i32 %a) { +; CHECK-LABEL: dup_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.4s, w0 +; CHECK-NEXT: ret +entry: + %b = insertelement <4 x i32> poison, i32 %a, i64 0 + %c = shufflevector <4 x i32> %b, <4 x i32> poison, <4 x i32> zeroinitializer + ret <4 x i32> %c +} + +define <4 x i32> @duplane0_v4i32(<4 x i32> %b) { +; CHECK-LABEL: duplane0_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.4s, v0.s[0] +; CHECK-NEXT: ret +entry: + %c = shufflevector <4 x i32> %b, <4 x i32> poison, <4 x i32> zeroinitializer + ret <4 x i32> %c +} + +define <4 x i32> @loaddup_v4i32(ptr %p) { +; CHECK-LABEL: loaddup_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ld1r { v0.4s }, [x0] +; CHECK-NEXT: ret +entry: + %a = load i32, ptr %p + %b = insertelement <4 x i32> poison, i32 %a, i64 0 + %c = shufflevector <4 x i32> %b, <4 x i32> poison, <4 x i32> zeroinitializer + ret <4 x i32> %c +} + +define <8 x i32> @dup_v8i32(i32 %a) { +; CHECK-LABEL: dup_v8i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.4s, w0 +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: ret +entry: + %b = 
insertelement <8 x i32> poison, i32 %a, i64 0 + %c = shufflevector <8 x i32> %b, <8 x i32> poison, <8 x i32> zeroinitializer + ret <8 x i32> %c +} + +define <8 x i32> @duplane0_v8i32(<8 x i32> %b) { +; CHECK-LABEL: duplane0_v8i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.4s, v0.s[0] +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: ret +entry: + %c = shufflevector <8 x i32> %b, <8 x i32> poison, <8 x i32> zeroinitializer + ret <8 x i32> %c +} + +define <8 x i32> @loaddup_v8i32(ptr %p) { +; CHECK-SD-LABEL: loaddup_v8i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ld1r { v0.4s }, [x0] +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: loaddup_v8i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ld1r { v0.4s }, [x0] +; CHECK-GI-NEXT: ld1r { v1.4s }, [x0] +; CHECK-GI-NEXT: ret +entry: + %a = load i32, ptr %p + %b = insertelement <8 x i32> poison, i32 %a, i64 0 + %c = shufflevector <8 x i32> %b, <8 x i32> poison, <8 x i32> zeroinitializer + ret <8 x i32> %c +} + +define <2 x i64> @dup_v2i64(i64 %a) { +; CHECK-LABEL: dup_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.2d, x0 +; CHECK-NEXT: ret +entry: + %b = insertelement <2 x i64> poison, i64 %a, i64 0 + %c = shufflevector <2 x i64> %b, <2 x i64> poison, <2 x i32> zeroinitializer + ret <2 x i64> %c +} + +define <2 x i64> @duplane0_v2i64(<2 x i64> %b) { +; CHECK-LABEL: duplane0_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.2d, v0.d[0] +; CHECK-NEXT: ret +entry: + %c = shufflevector <2 x i64> %b, <2 x i64> poison, <2 x i32> zeroinitializer + ret <2 x i64> %c +} + +define <2 x i64> @loaddup_v2i64(ptr %p) { +; CHECK-LABEL: loaddup_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ld1r { v0.2d }, [x0] +; CHECK-NEXT: ret +entry: + %a = load i64, ptr %p + %b = insertelement <2 x i64> poison, i64 %a, i64 0 + %c = shufflevector <2 x i64> %b, <2 x i64> poison, <2 x i32> zeroinitializer + ret <2 x i64> %c +} + +define <3 x i64> @dup_v3i64(i64 %a) { +; CHECK-SD-LABEL: dup_v3i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov d0, x0 +; CHECK-SD-NEXT: fmov d1, d0 +; CHECK-SD-NEXT: fmov d2, d0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: dup_v3i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: dup v0.2d, x0 +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: fmov d2, d0 +; CHECK-GI-NEXT: ret +entry: + %b = insertelement <3 x i64> poison, i64 %a, i64 0 + %c = shufflevector <3 x i64> %b, <3 x i64> poison, <3 x i32> zeroinitializer + ret <3 x i64> %c +} + +define <3 x i64> @duplane0_v3i64(<3 x i64> %b) { +; CHECK-SD-LABEL: duplane0_v3i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov d1, d0 +; CHECK-SD-NEXT: fmov d2, d0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: duplane0_v3i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fmov d2, d0 +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: mov v2.d[1], v1.d[0] +; CHECK-GI-NEXT: dup v0.2d, v2.d[0] +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret +entry: + %c = shufflevector <3 x i64> %b, <3 x i64> poison, <3 x i32> zeroinitializer + ret <3 x i64> %c +} + +define <3 x i64> @loaddup_v3i64(ptr %p) { +; CHECK-SD-LABEL: loaddup_v3i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldr d0, [x0] +; CHECK-SD-NEXT: fmov d1, d0 +; CHECK-SD-NEXT: fmov d2, d0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: loaddup_v3i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ld1r { v0.2d 
}, [x0] +; CHECK-GI-NEXT: ld1r { v2.2d }, [x0] +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret +entry: + %a = load i64, ptr %p + %b = insertelement <3 x i64> poison, i64 %a, i64 0 + %c = shufflevector <3 x i64> %b, <3 x i64> poison, <3 x i32> zeroinitializer + ret <3 x i64> %c +} + +define <4 x i64> @dup_v4i64(i64 %a) { +; CHECK-LABEL: dup_v4i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.2d, x0 +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: ret +entry: + %b = insertelement <4 x i64> poison, i64 %a, i64 0 + %c = shufflevector <4 x i64> %b, <4 x i64> poison, <4 x i32> zeroinitializer + ret <4 x i64> %c +} + +define <4 x i64> @duplane0_v4i64(<4 x i64> %b) { +; CHECK-LABEL: duplane0_v4i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.2d, v0.d[0] +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: ret +entry: + %c = shufflevector <4 x i64> %b, <4 x i64> poison, <4 x i32> zeroinitializer + ret <4 x i64> %c +} + +define <4 x i64> @loaddup_v4i64(ptr %p) { +; CHECK-SD-LABEL: loaddup_v4i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ld1r { v0.2d }, [x0] +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: loaddup_v4i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ld1r { v0.2d }, [x0] +; CHECK-GI-NEXT: ld1r { v1.2d }, [x0] +; CHECK-GI-NEXT: ret +entry: + %a = load i64, ptr %p + %b = insertelement <4 x i64> poison, i64 %a, i64 0 + %c = shufflevector <4 x i64> %b, <4 x i64> poison, <4 x i32> zeroinitializer + ret <4 x i64> %c +} + +define <2 x i128> @dup_v2i128(i128 %a) { +; CHECK-LABEL: dup_v2i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x2, x0 +; CHECK-NEXT: mov x3, x1 +; CHECK-NEXT: ret +entry: + %b = insertelement <2 x i128> poison, i128 %a, i64 0 + %c = shufflevector <2 x i128> %b, <2 x i128> poison, <2 x i32> zeroinitializer + ret <2 x i128> %c +} + +define <2 x i128> @duplane0_v2i128(<2 x i128> %b) { +; CHECK-LABEL: duplane0_v2i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x2, x0 +; CHECK-NEXT: mov x3, x1 +; CHECK-NEXT: ret +entry: + %c = shufflevector <2 x i128> %b, <2 x i128> poison, <2 x i32> zeroinitializer + ret <2 x i128> %c +} + +define <2 x i128> @loaddup_v2i128(ptr %p) { +; CHECK-LABEL: loaddup_v2i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldp x2, x1, [x0] +; CHECK-NEXT: mov x0, x2 +; CHECK-NEXT: mov x3, x1 +; CHECK-NEXT: ret +entry: + %a = load i128, ptr %p + %b = insertelement <2 x i128> poison, i128 %a, i64 0 + %c = shufflevector <2 x i128> %b, <2 x i128> poison, <2 x i32> zeroinitializer + ret <2 x i128> %c +} + +define <3 x i128> @dup_v3i128(i128 %a) { +; CHECK-LABEL: dup_v3i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x2, x0 +; CHECK-NEXT: mov x3, x1 +; CHECK-NEXT: mov x4, x0 +; CHECK-NEXT: mov x5, x1 +; CHECK-NEXT: ret +entry: + %b = insertelement <3 x i128> poison, i128 %a, i64 0 + %c = shufflevector <3 x i128> %b, <3 x i128> poison, <3 x i32> zeroinitializer + ret <3 x i128> %c +} + +define <3 x i128> @duplane0_v3i128(<3 x i128> %b) { +; CHECK-LABEL: duplane0_v3i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x2, x0 +; CHECK-NEXT: mov x3, x1 +; CHECK-NEXT: mov x4, x0 +; CHECK-NEXT: mov x5, x1 +; CHECK-NEXT: ret +entry: + %c = shufflevector <3 x i128> %b, <3 x i128> poison, <3 x i32> zeroinitializer + ret <3 x i128> %c +} + +define <3 x i128> @loaddup_v3i128(ptr %p) { +; CHECK-LABEL: loaddup_v3i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldp x2, x1, [x0] +; 
CHECK-NEXT: mov x0, x2 +; CHECK-NEXT: mov x3, x1 +; CHECK-NEXT: mov x4, x2 +; CHECK-NEXT: mov x5, x1 +; CHECK-NEXT: ret +entry: + %a = load i128, ptr %p + %b = insertelement <3 x i128> poison, i128 %a, i64 0 + %c = shufflevector <3 x i128> %b, <3 x i128> poison, <3 x i32> zeroinitializer + ret <3 x i128> %c +} + +define <4 x i128> @dup_v4i128(i128 %a) { +; CHECK-LABEL: dup_v4i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x2, x0 +; CHECK-NEXT: mov x3, x1 +; CHECK-NEXT: mov x4, x0 +; CHECK-NEXT: mov x5, x1 +; CHECK-NEXT: mov x6, x0 +; CHECK-NEXT: mov x7, x1 +; CHECK-NEXT: ret +entry: + %b = insertelement <4 x i128> poison, i128 %a, i64 0 + %c = shufflevector <4 x i128> %b, <4 x i128> poison, <4 x i32> zeroinitializer + ret <4 x i128> %c +} + +define <4 x i128> @duplane0_v4i128(<4 x i128> %b) { +; CHECK-LABEL: duplane0_v4i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x2, x0 +; CHECK-NEXT: mov x3, x1 +; CHECK-NEXT: mov x4, x0 +; CHECK-NEXT: mov x5, x1 +; CHECK-NEXT: mov x6, x0 +; CHECK-NEXT: mov x7, x1 +; CHECK-NEXT: ret +entry: + %c = shufflevector <4 x i128> %b, <4 x i128> poison, <4 x i32> zeroinitializer + ret <4 x i128> %c +} + +define <4 x i128> @loaddup_v4i128(ptr %p) { +; CHECK-LABEL: loaddup_v4i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldp x2, x1, [x0] +; CHECK-NEXT: mov x0, x2 +; CHECK-NEXT: mov x3, x1 +; CHECK-NEXT: mov x4, x2 +; CHECK-NEXT: mov x5, x1 +; CHECK-NEXT: mov x6, x2 +; CHECK-NEXT: mov x7, x1 +; CHECK-NEXT: ret +entry: + %a = load i128, ptr %p + %b = insertelement <4 x i128> poison, i128 %a, i64 0 + %c = shufflevector <4 x i128> %b, <4 x i128> poison, <4 x i32> zeroinitializer + ret <4 x i128> %c +} + +define <2 x half> @dup_v2half(half %a) { +; CHECK-LABEL: dup_v2half: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0 +; CHECK-NEXT: dup v0.4h, v0.h[0] +; CHECK-NEXT: ret +entry: + %b = insertelement <2 x half> poison, half %a, i64 0 + %c = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> zeroinitializer + ret <2 x half> %c +} + +define <2 x half> @duplane0_v2half(<2 x half> %b) { +; CHECK-LABEL: duplane0_v2half: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.4h, v0.h[0] +; CHECK-NEXT: ret +entry: + %c = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> zeroinitializer + ret <2 x half> %c +} + +define <2 x half> @loaddup_v2half(ptr %p) { +; CHECK-LABEL: loaddup_v2half: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ld1r { v0.4h }, [x0] +; CHECK-NEXT: ret +entry: + %a = load half, ptr %p + %b = insertelement <2 x half> poison, half %a, i64 0 + %c = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> zeroinitializer + ret <2 x half> %c +} + +define <3 x half> @dup_v3half(half %a) { +; CHECK-LABEL: dup_v3half: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0 +; CHECK-NEXT: dup v0.4h, v0.h[0] +; CHECK-NEXT: ret +entry: + %b = insertelement <3 x half> poison, half %a, i64 0 + %c = shufflevector <3 x half> %b, <3 x half> poison, <3 x i32> zeroinitializer + ret <3 x half> %c +} + +define <3 x half> @duplane0_v3half(<3 x half> %b) { +; CHECK-LABEL: duplane0_v3half: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.4h, v0.h[0] +; CHECK-NEXT: ret +entry: + %c = shufflevector <3 x half> %b, <3 x half> poison, <3 x i32> zeroinitializer + ret <3 x half> %c +} + +define <3 x half> @loaddup_v3half(ptr %p) { +; CHECK-LABEL: loaddup_v3half: +; CHECK: // %bb.0: // %entry +; 
CHECK-NEXT: ld1r { v0.4h }, [x0] +; CHECK-NEXT: ret +entry: + %a = load half, ptr %p + %b = insertelement <3 x half> poison, half %a, i64 0 + %c = shufflevector <3 x half> %b, <3 x half> poison, <3 x i32> zeroinitializer + ret <3 x half> %c +} + +define <4 x half> @dup_v4half(half %a) { +; CHECK-LABEL: dup_v4half: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0 +; CHECK-NEXT: dup v0.4h, v0.h[0] +; CHECK-NEXT: ret +entry: + %b = insertelement <4 x half> poison, half %a, i64 0 + %c = shufflevector <4 x half> %b, <4 x half> poison, <4 x i32> zeroinitializer + ret <4 x half> %c +} + +define <4 x half> @duplane0_v4half(<4 x half> %b) { +; CHECK-LABEL: duplane0_v4half: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.4h, v0.h[0] +; CHECK-NEXT: ret +entry: + %c = shufflevector <4 x half> %b, <4 x half> poison, <4 x i32> zeroinitializer + ret <4 x half> %c +} + +define <4 x half> @loaddup_v4half(ptr %p) { +; CHECK-LABEL: loaddup_v4half: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ld1r { v0.4h }, [x0] +; CHECK-NEXT: ret +entry: + %a = load half, ptr %p + %b = insertelement <4 x half> poison, half %a, i64 0 + %c = shufflevector <4 x half> %b, <4 x half> poison, <4 x i32> zeroinitializer + ret <4 x half> %c +} + +define <8 x half> @dup_v8half(half %a) { +; CHECK-LABEL: dup_v8half: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0 +; CHECK-NEXT: dup v0.8h, v0.h[0] +; CHECK-NEXT: ret +entry: + %b = insertelement <8 x half> poison, half %a, i64 0 + %c = shufflevector <8 x half> %b, <8 x half> poison, <8 x i32> zeroinitializer + ret <8 x half> %c +} + +define <8 x half> @duplane0_v8half(<8 x half> %b) { +; CHECK-LABEL: duplane0_v8half: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.8h, v0.h[0] +; CHECK-NEXT: ret +entry: + %c = shufflevector <8 x half> %b, <8 x half> poison, <8 x i32> zeroinitializer + ret <8 x half> %c +} + +define <8 x half> @loaddup_v8half(ptr %p) { +; CHECK-LABEL: loaddup_v8half: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ld1r { v0.8h }, [x0] +; CHECK-NEXT: ret +entry: + %a = load half, ptr %p + %b = insertelement <8 x half> poison, half %a, i64 0 + %c = shufflevector <8 x half> %b, <8 x half> poison, <8 x i32> zeroinitializer + ret <8 x half> %c +} + +define <16 x half> @dup_v16half(half %a) { +; CHECK-SD-LABEL: dup_v16half: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $q0 +; CHECK-SD-NEXT: dup v0.8h, v0.h[0] +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: dup_v16half: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $h0 killed $h0 def $q0 +; CHECK-GI-NEXT: dup v2.8h, v0.h[0] +; CHECK-GI-NEXT: dup v1.8h, v0.h[0] +; CHECK-GI-NEXT: mov v0.16b, v2.16b +; CHECK-GI-NEXT: ret +entry: + %b = insertelement <16 x half> poison, half %a, i64 0 + %c = shufflevector <16 x half> %b, <16 x half> poison, <16 x i32> zeroinitializer + ret <16 x half> %c +} + +define <16 x half> @duplane0_v16half(<16 x half> %b) { +; CHECK-LABEL: duplane0_v16half: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.8h, v0.h[0] +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: ret +entry: + %c = shufflevector <16 x half> %b, <16 x half> poison, <16 x i32> zeroinitializer + ret <16 x half> %c +} + +define <16 x half> @loaddup_v16half(ptr %p) { +; CHECK-SD-LABEL: loaddup_v16half: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ld1r { v0.8h }, [x0] +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: ret +; +; 
CHECK-GI-LABEL: loaddup_v16half: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ld1r { v0.8h }, [x0] +; CHECK-GI-NEXT: ld1r { v1.8h }, [x0] +; CHECK-GI-NEXT: ret +entry: + %a = load half, ptr %p + %b = insertelement <16 x half> poison, half %a, i64 0 + %c = shufflevector <16 x half> %b, <16 x half> poison, <16 x i32> zeroinitializer + ret <16 x half> %c +} + +define <2 x bfloat> @dup_v2bfloat(bfloat %a) { +; CHECK-LABEL: dup_v2bfloat: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0 +; CHECK-NEXT: dup v0.4h, v0.h[0] +; CHECK-NEXT: ret +entry: + %b = insertelement <2 x bfloat> poison, bfloat %a, i64 0 + %c = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> zeroinitializer + ret <2 x bfloat> %c +} + +define <2 x bfloat> @duplane0_v2bfloat(<2 x bfloat> %b) { +; CHECK-LABEL: duplane0_v2bfloat: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.4h, v0.h[0] +; CHECK-NEXT: ret +entry: + %c = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> zeroinitializer + ret <2 x bfloat> %c +} + +define <2 x bfloat> @loaddup_v2bfloat(ptr %p) { +; CHECK-LABEL: loaddup_v2bfloat: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ld1r { v0.4h }, [x0] +; CHECK-NEXT: ret +entry: + %a = load bfloat, ptr %p + %b = insertelement <2 x bfloat> poison, bfloat %a, i64 0 + %c = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> zeroinitializer + ret <2 x bfloat> %c +} + +define <3 x bfloat> @dup_v3bfloat(bfloat %a) { +; CHECK-LABEL: dup_v3bfloat: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0 +; CHECK-NEXT: dup v0.4h, v0.h[0] +; CHECK-NEXT: ret +entry: + %b = insertelement <3 x bfloat> poison, bfloat %a, i64 0 + %c = shufflevector <3 x bfloat> %b, <3 x bfloat> poison, <3 x i32> zeroinitializer + ret <3 x bfloat> %c +} + +define <3 x bfloat> @duplane0_v3bfloat(<3 x bfloat> %b) { +; CHECK-LABEL: duplane0_v3bfloat: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.4h, v0.h[0] +; CHECK-NEXT: ret +entry: + %c = shufflevector <3 x bfloat> %b, <3 x bfloat> poison, <3 x i32> zeroinitializer + ret <3 x bfloat> %c +} + +define <3 x bfloat> @loaddup_v3bfloat(ptr %p) { +; CHECK-LABEL: loaddup_v3bfloat: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ld1r { v0.4h }, [x0] +; CHECK-NEXT: ret +entry: + %a = load bfloat, ptr %p + %b = insertelement <3 x bfloat> poison, bfloat %a, i64 0 + %c = shufflevector <3 x bfloat> %b, <3 x bfloat> poison, <3 x i32> zeroinitializer + ret <3 x bfloat> %c +} + +define <4 x bfloat> @dup_v4bfloat(bfloat %a) { +; CHECK-LABEL: dup_v4bfloat: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0 +; CHECK-NEXT: dup v0.4h, v0.h[0] +; CHECK-NEXT: ret +entry: + %b = insertelement <4 x bfloat> poison, bfloat %a, i64 0 + %c = shufflevector <4 x bfloat> %b, <4 x bfloat> poison, <4 x i32> zeroinitializer + ret <4 x bfloat> %c +} + +define <4 x bfloat> @duplane0_v4bfloat(<4 x bfloat> %b) { +; CHECK-LABEL: duplane0_v4bfloat: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.4h, v0.h[0] +; CHECK-NEXT: ret +entry: + %c = shufflevector <4 x bfloat> %b, <4 x bfloat> poison, <4 x i32> zeroinitializer + ret <4 x bfloat> %c +} + +define <4 x bfloat> @loaddup_v4bfloat(ptr %p) { +; CHECK-LABEL: loaddup_v4bfloat: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ld1r { v0.4h }, [x0] +; CHECK-NEXT: ret +entry: + %a = load bfloat, ptr %p + %b = insertelement 
<4 x bfloat> poison, bfloat %a, i64 0 + %c = shufflevector <4 x bfloat> %b, <4 x bfloat> poison, <4 x i32> zeroinitializer + ret <4 x bfloat> %c +} + +define <8 x bfloat> @dup_v8bfloat(bfloat %a) { +; CHECK-LABEL: dup_v8bfloat: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0 +; CHECK-NEXT: dup v0.8h, v0.h[0] +; CHECK-NEXT: ret +entry: + %b = insertelement <8 x bfloat> poison, bfloat %a, i64 0 + %c = shufflevector <8 x bfloat> %b, <8 x bfloat> poison, <8 x i32> zeroinitializer + ret <8 x bfloat> %c +} + +define <8 x bfloat> @duplane0_v8bfloat(<8 x bfloat> %b) { +; CHECK-LABEL: duplane0_v8bfloat: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.8h, v0.h[0] +; CHECK-NEXT: ret +entry: + %c = shufflevector <8 x bfloat> %b, <8 x bfloat> poison, <8 x i32> zeroinitializer + ret <8 x bfloat> %c +} + +define <8 x bfloat> @loaddup_v8bfloat(ptr %p) { +; CHECK-LABEL: loaddup_v8bfloat: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ld1r { v0.8h }, [x0] +; CHECK-NEXT: ret +entry: + %a = load bfloat, ptr %p + %b = insertelement <8 x bfloat> poison, bfloat %a, i64 0 + %c = shufflevector <8 x bfloat> %b, <8 x bfloat> poison, <8 x i32> zeroinitializer + ret <8 x bfloat> %c +} + +define <16 x bfloat> @dup_v16bfloat(bfloat %a) { +; CHECK-SD-LABEL: dup_v16bfloat: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $q0 +; CHECK-SD-NEXT: dup v0.8h, v0.h[0] +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: dup_v16bfloat: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $h0 killed $h0 def $q0 +; CHECK-GI-NEXT: dup v2.8h, v0.h[0] +; CHECK-GI-NEXT: dup v1.8h, v0.h[0] +; CHECK-GI-NEXT: mov v0.16b, v2.16b +; CHECK-GI-NEXT: ret +entry: + %b = insertelement <16 x bfloat> poison, bfloat %a, i64 0 + %c = shufflevector <16 x bfloat> %b, <16 x bfloat> poison, <16 x i32> zeroinitializer + ret <16 x bfloat> %c +} + +define <16 x bfloat> @duplane0_v16bfloat(<16 x bfloat> %b) { +; CHECK-LABEL: duplane0_v16bfloat: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.8h, v0.h[0] +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: ret +entry: + %c = shufflevector <16 x bfloat> %b, <16 x bfloat> poison, <16 x i32> zeroinitializer + ret <16 x bfloat> %c +} + +define <16 x bfloat> @loaddup_v16bfloat(ptr %p) { +; CHECK-SD-LABEL: loaddup_v16bfloat: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ld1r { v0.8h }, [x0] +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: loaddup_v16bfloat: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ld1r { v0.8h }, [x0] +; CHECK-GI-NEXT: ld1r { v1.8h }, [x0] +; CHECK-GI-NEXT: ret +entry: + %a = load bfloat, ptr %p + %b = insertelement <16 x bfloat> poison, bfloat %a, i64 0 + %c = shufflevector <16 x bfloat> %b, <16 x bfloat> poison, <16 x i32> zeroinitializer + ret <16 x bfloat> %c +} + +define <2 x float> @dup_v2float(float %a) { +; CHECK-LABEL: dup_v2float: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: dup v0.2s, v0.s[0] +; CHECK-NEXT: ret +entry: + %b = insertelement <2 x float> poison, float %a, i64 0 + %c = shufflevector <2 x float> %b, <2 x float> poison, <2 x i32> zeroinitializer + ret <2 x float> %c +} + +define <2 x float> @duplane0_v2float(<2 x float> %b) { +; CHECK-LABEL: duplane0_v2float: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.2s, v0.s[0] +; CHECK-NEXT: ret +entry: + %c = shufflevector <2 x float> %b, <2 x float> poison, <2 x i32> 
zeroinitializer + ret <2 x float> %c +} + +define <2 x float> @loaddup_v2float(ptr %p) { +; CHECK-LABEL: loaddup_v2float: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ld1r { v0.2s }, [x0] +; CHECK-NEXT: ret +entry: + %a = load float, ptr %p + %b = insertelement <2 x float> poison, float %a, i64 0 + %c = shufflevector <2 x float> %b, <2 x float> poison, <2 x i32> zeroinitializer + ret <2 x float> %c +} + +define <3 x float> @dup_v3float(float %a) { +; CHECK-LABEL: dup_v3float: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: dup v0.4s, v0.s[0] +; CHECK-NEXT: ret +entry: + %b = insertelement <3 x float> poison, float %a, i64 0 + %c = shufflevector <3 x float> %b, <3 x float> poison, <3 x i32> zeroinitializer + ret <3 x float> %c +} + +define <3 x float> @duplane0_v3float(<3 x float> %b) { +; CHECK-LABEL: duplane0_v3float: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.4s, v0.s[0] +; CHECK-NEXT: ret +entry: + %c = shufflevector <3 x float> %b, <3 x float> poison, <3 x i32> zeroinitializer + ret <3 x float> %c +} + +define <3 x float> @loaddup_v3float(ptr %p) { +; CHECK-LABEL: loaddup_v3float: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ld1r { v0.4s }, [x0] +; CHECK-NEXT: ret +entry: + %a = load float, ptr %p + %b = insertelement <3 x float> poison, float %a, i64 0 + %c = shufflevector <3 x float> %b, <3 x float> poison, <3 x i32> zeroinitializer + ret <3 x float> %c +} + +define <4 x float> @dup_v4float(float %a) { +; CHECK-LABEL: dup_v4float: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: dup v0.4s, v0.s[0] +; CHECK-NEXT: ret +entry: + %b = insertelement <4 x float> poison, float %a, i64 0 + %c = shufflevector <4 x float> %b, <4 x float> poison, <4 x i32> zeroinitializer + ret <4 x float> %c +} + +define <4 x float> @duplane0_v4float(<4 x float> %b) { +; CHECK-LABEL: duplane0_v4float: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.4s, v0.s[0] +; CHECK-NEXT: ret +entry: + %c = shufflevector <4 x float> %b, <4 x float> poison, <4 x i32> zeroinitializer + ret <4 x float> %c +} + +define <4 x float> @loaddup_v4float(ptr %p) { +; CHECK-LABEL: loaddup_v4float: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ld1r { v0.4s }, [x0] +; CHECK-NEXT: ret +entry: + %a = load float, ptr %p + %b = insertelement <4 x float> poison, float %a, i64 0 + %c = shufflevector <4 x float> %b, <4 x float> poison, <4 x i32> zeroinitializer + ret <4 x float> %c +} + +define <8 x float> @dup_v8float(float %a) { +; CHECK-SD-LABEL: dup_v8float: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-SD-NEXT: dup v0.4s, v0.s[0] +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: dup_v8float: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: dup v2.4s, v0.s[0] +; CHECK-GI-NEXT: dup v1.4s, v0.s[0] +; CHECK-GI-NEXT: mov v0.16b, v2.16b +; CHECK-GI-NEXT: ret +entry: + %b = insertelement <8 x float> poison, float %a, i64 0 + %c = shufflevector <8 x float> %b, <8 x float> poison, <8 x i32> zeroinitializer + ret <8 x float> %c +} + +define <8 x float> @duplane0_v8float(<8 x float> %b) { +; CHECK-LABEL: duplane0_v8float: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.4s, v0.s[0] +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: ret +entry: + %c = shufflevector <8 x float> %b, <8 x float> poison, <8 x i32> zeroinitializer + ret <8 x float> %c +} + +define <8 x float> @loaddup_v8float(ptr %p) { +; 
CHECK-SD-LABEL: loaddup_v8float: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ld1r { v0.4s }, [x0] +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: loaddup_v8float: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ld1r { v0.4s }, [x0] +; CHECK-GI-NEXT: ld1r { v1.4s }, [x0] +; CHECK-GI-NEXT: ret +entry: + %a = load float, ptr %p + %b = insertelement <8 x float> poison, float %a, i64 0 + %c = shufflevector <8 x float> %b, <8 x float> poison, <8 x i32> zeroinitializer + ret <8 x float> %c +} + +define <2 x double> @dup_v2double(double %a) { +; CHECK-LABEL: dup_v2double: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.2d, v0.d[0] +; CHECK-NEXT: ret +entry: + %b = insertelement <2 x double> poison, double %a, i64 0 + %c = shufflevector <2 x double> %b, <2 x double> poison, <2 x i32> zeroinitializer + ret <2 x double> %c +} + +define <2 x double> @duplane0_v2double(<2 x double> %b) { +; CHECK-LABEL: duplane0_v2double: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.2d, v0.d[0] +; CHECK-NEXT: ret +entry: + %c = shufflevector <2 x double> %b, <2 x double> poison, <2 x i32> zeroinitializer + ret <2 x double> %c +} + +define <2 x double> @loaddup_v2double(ptr %p) { +; CHECK-LABEL: loaddup_v2double: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ld1r { v0.2d }, [x0] +; CHECK-NEXT: ret +entry: + %a = load double, ptr %p + %b = insertelement <2 x double> poison, double %a, i64 0 + %c = shufflevector <2 x double> %b, <2 x double> poison, <2 x i32> zeroinitializer + ret <2 x double> %c +} + +define <3 x double> @dup_v3double(double %a) { +; CHECK-SD-LABEL: dup_v3double: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov d1, d0 +; CHECK-SD-NEXT: fmov d2, d0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: dup_v3double: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: dup v3.2d, v0.d[0] +; CHECK-GI-NEXT: dup v2.2d, v0.d[0] +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-GI-NEXT: mov d1, v3.d[1] +; CHECK-GI-NEXT: fmov d0, d3 +; CHECK-GI-NEXT: ret +entry: + %b = insertelement <3 x double> poison, double %a, i64 0 + %c = shufflevector <3 x double> %b, <3 x double> poison, <3 x i32> zeroinitializer + ret <3 x double> %c +} + +define <3 x double> @duplane0_v3double(<3 x double> %b) { +; CHECK-SD-LABEL: duplane0_v3double: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov d1, d0 +; CHECK-SD-NEXT: fmov d2, d0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: duplane0_v3double: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fmov d2, d0 +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: mov v2.d[1], v1.d[0] +; CHECK-GI-NEXT: dup v0.2d, v2.d[0] +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret +entry: + %c = shufflevector <3 x double> %b, <3 x double> poison, <3 x i32> zeroinitializer + ret <3 x double> %c +} + +define <3 x double> @loaddup_v3double(ptr %p) { +; CHECK-SD-LABEL: loaddup_v3double: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldr d0, [x0] +; CHECK-SD-NEXT: fmov d1, d0 +; CHECK-SD-NEXT: fmov d2, d0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: loaddup_v3double: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ld1r { v0.2d }, [x0] +; CHECK-GI-NEXT: ld1r { v2.2d }, [x0] +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: // kill: def 
$d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret +entry: + %a = load double, ptr %p + %b = insertelement <3 x double> poison, double %a, i64 0 + %c = shufflevector <3 x double> %b, <3 x double> poison, <3 x i32> zeroinitializer + ret <3 x double> %c +} + +define <4 x double> @dup_v4double(double %a) { +; CHECK-SD-LABEL: dup_v4double: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: dup v0.2d, v0.d[0] +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: dup_v4double: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: dup v2.2d, v0.d[0] +; CHECK-GI-NEXT: dup v1.2d, v0.d[0] +; CHECK-GI-NEXT: mov v0.16b, v2.16b +; CHECK-GI-NEXT: ret +entry: + %b = insertelement <4 x double> poison, double %a, i64 0 + %c = shufflevector <4 x double> %b, <4 x double> poison, <4 x i32> zeroinitializer + ret <4 x double> %c +} + +define <4 x double> @duplane0_v4double(<4 x double> %b) { +; CHECK-LABEL: duplane0_v4double: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v0.2d, v0.d[0] +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: ret +entry: + %c = shufflevector <4 x double> %b, <4 x double> poison, <4 x i32> zeroinitializer + ret <4 x double> %c +} + +define <4 x double> @loaddup_v4double(ptr %p) { +; CHECK-SD-LABEL: loaddup_v4double: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ld1r { v0.2d }, [x0] +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: loaddup_v4double: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ld1r { v0.2d }, [x0] +; CHECK-GI-NEXT: ld1r { v1.2d }, [x0] +; CHECK-GI-NEXT: ret +entry: + %a = load double, ptr %p + %b = insertelement <4 x double> poison, double %a, i64 0 + %c = shufflevector <4 x double> %b, <4 x double> poison, <4 x i32> zeroinitializer + ret <4 x double> %c +} + +define <2 x fp128> @dup_v2fp128(fp128 %a) { +; CHECK-LABEL: dup_v2fp128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: ret +entry: + %b = insertelement <2 x fp128> poison, fp128 %a, i64 0 + %c = shufflevector <2 x fp128> %b, <2 x fp128> poison, <2 x i32> zeroinitializer + ret <2 x fp128> %c +} + +define <2 x fp128> @duplane0_v2fp128(<2 x fp128> %b) { +; CHECK-LABEL: duplane0_v2fp128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: ret +entry: + %c = shufflevector <2 x fp128> %b, <2 x fp128> poison, <2 x i32> zeroinitializer + ret <2 x fp128> %c +} + +define <2 x fp128> @loaddup_v2fp128(ptr %p) { +; CHECK-LABEL: loaddup_v2fp128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: ret +entry: + %a = load fp128, ptr %p + %b = insertelement <2 x fp128> poison, fp128 %a, i64 0 + %c = shufflevector <2 x fp128> %b, <2 x fp128> poison, <2 x i32> zeroinitializer + ret <2 x fp128> %c +} + +define <3 x fp128> @dup_v3fp128(fp128 %a) { +; CHECK-LABEL: dup_v3fp128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: mov v2.16b, v0.16b +; CHECK-NEXT: ret +entry: + %b = insertelement <3 x fp128> poison, fp128 %a, i64 0 + %c = shufflevector <3 x fp128> %b, <3 x fp128> poison, <3 x i32> zeroinitializer + ret <3 x fp128> %c +} + +define <3 x fp128> @duplane0_v3fp128(<3 x fp128> %b) { +; CHECK-LABEL: duplane0_v3fp128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: mov v2.16b, v0.16b +; CHECK-NEXT: ret +entry: + %c = shufflevector <3 x fp128> %b, <3 x fp128> poison, <3 x i32> zeroinitializer 
+ ret <3 x fp128> %c +} + +define <3 x fp128> @loaddup_v3fp128(ptr %p) { +; CHECK-LABEL: loaddup_v3fp128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: mov v2.16b, v0.16b +; CHECK-NEXT: ret +entry: + %a = load fp128, ptr %p + %b = insertelement <3 x fp128> poison, fp128 %a, i64 0 + %c = shufflevector <3 x fp128> %b, <3 x fp128> poison, <3 x i32> zeroinitializer + ret <3 x fp128> %c +} + +define <4 x fp128> @dup_v4fp128(fp128 %a) { +; CHECK-LABEL: dup_v4fp128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: mov v2.16b, v0.16b +; CHECK-NEXT: mov v3.16b, v0.16b +; CHECK-NEXT: ret +entry: + %b = insertelement <4 x fp128> poison, fp128 %a, i64 0 + %c = shufflevector <4 x fp128> %b, <4 x fp128> poison, <4 x i32> zeroinitializer + ret <4 x fp128> %c +} + +define <4 x fp128> @duplane0_v4fp128(<4 x fp128> %b) { +; CHECK-LABEL: duplane0_v4fp128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: mov v2.16b, v0.16b +; CHECK-NEXT: mov v3.16b, v0.16b +; CHECK-NEXT: ret +entry: + %c = shufflevector <4 x fp128> %b, <4 x fp128> poison, <4 x i32> zeroinitializer + ret <4 x fp128> %c +} + +define <4 x fp128> @loaddup_v4fp128(ptr %p) { +; CHECK-LABEL: loaddup_v4fp128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: mov v2.16b, v0.16b +; CHECK-NEXT: mov v3.16b, v0.16b +; CHECK-NEXT: ret +entry: + %a = load fp128, ptr %p + %b = insertelement <4 x fp128> poison, fp128 %a, i64 0 + %c = shufflevector <4 x fp128> %b, <4 x fp128> poison, <4 x i32> zeroinitializer + ret <4 x fp128> %c +} diff --git a/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll b/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll new file mode 100644 index 00000000000000..7f587ac0b87161 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll @@ -0,0 +1,455 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s + +define amdgpu_ps void @group_image_sample(i32 inreg noundef %globalTable, i32 inreg noundef %userdata6, i32 inreg noundef %userdata7, i32 inreg noundef %userdata8, i32 inreg noundef %PrimMask, <2 x float> noundef %PerspInterpSample, <2 x float> noundef %PerspInterpCenter, <2 x float> noundef %PerspInterpCentroid) #2 { +; GFX11-LABEL: group_image_sample: +; GFX11: ; %bb.0: ; %.entry +; GFX11-NEXT: s_mov_b32 s24, exec_lo +; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-NEXT: s_mov_b32 m0, s4 +; GFX11-NEXT: s_getpc_b64 s[4:5] +; GFX11-NEXT: s_mov_b32 s0, s1 +; GFX11-NEXT: s_mov_b32 s6, s3 +; GFX11-NEXT: s_mov_b32 s1, s5 +; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s7, s5 +; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x0 +; GFX11-NEXT: s_load_b256 s[0:7], s[6:7], 0x0 +; GFX11-NEXT: s_mov_b32 s16, exec_lo +; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-NEXT: lds_param_load v2, attr0.y wait_vdst:15 +; GFX11-NEXT: lds_param_load v3, attr0.x wait_vdst:15 +; GFX11-NEXT: s_mov_b32 exec_lo, s16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0x10 +; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0x20 +; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0x30 +; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0x40 +; GFX11-NEXT: v_interp_p10_f32 v4, v2, v0, v2 wait_exp:1 +; 
GFX11-NEXT: v_interp_p10_f32 v0, v3, v0, v3 wait_exp:0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_interp_p2_f32 v0, v3, v1, v0 wait_exp:7 +; GFX11-NEXT: v_interp_p2_f32 v1, v2, v1, v4 wait_exp:7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v4, s16, v0 :: v_dual_add_f32 v5, s17, v1 +; GFX11-NEXT: v_dual_add_f32 v12, s20, v0 :: v_dual_add_f32 v13, s21, v1 +; GFX11-NEXT: v_dual_add_f32 v8, s18, v0 :: v_dual_add_f32 v9, s19, v1 +; GFX11-NEXT: v_dual_add_f32 v16, s22, v0 :: v_dual_add_f32 v17, s23, v1 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: image_sample v[4:7], v[4:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_sample v[8:11], v[8:9], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_sample v[12:15], v[12:13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_sample v[16:19], v[16:17], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0x50 +; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0x60 +; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0x70 +; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0x80 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_add_f32 v20, s16, v0 :: v_dual_add_f32 v21, s17, v1 +; GFX11-NEXT: v_dual_add_f32 v28, s20, v0 :: v_dual_add_f32 v29, s21, v1 +; GFX11-NEXT: v_dual_add_f32 v24, s18, v0 :: v_dual_add_f32 v25, s19, v1 +; GFX11-NEXT: v_dual_add_f32 v32, s22, v0 :: v_dual_add_f32 v33, s23, v1 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: image_sample v[20:23], v[20:21], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_sample v[24:27], v[24:25], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_sample v[28:31], v[28:29], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_sample v[32:35], v[32:33], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0x90 +; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0xa0 +; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0xb0 +; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0xc0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_add_f32 v36, s16, v0 :: v_dual_add_f32 v37, s17, v1 +; GFX11-NEXT: v_dual_add_f32 v44, s20, v0 :: v_dual_add_f32 v45, s21, v1 +; GFX11-NEXT: v_dual_add_f32 v40, s18, v0 :: v_dual_add_f32 v41, s19, v1 +; GFX11-NEXT: v_dual_add_f32 v48, s22, v0 :: v_dual_add_f32 v49, s23, v1 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: image_sample v[36:39], v[36:37], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_sample v[40:43], v[40:41], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_sample v[44:47], v[44:45], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_sample v[48:51], v[48:49], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0xd0 +; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0xe0 +; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0xf0 +; GFX11-NEXT: s_buffer_load_b64 s[12:13], s[12:15], 0x100 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_add_f32 v52, s16, v0 :: v_dual_add_f32 v53, s17, v1 +; GFX11-NEXT: v_dual_add_f32 v56, s18, v0 :: v_dual_add_f32 v57, s19, v1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: image_sample v[52:55], v[52:53], s[0:7], s[8:11] dmask:0xf 
dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_sample v[56:59], v[56:57], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: v_dual_add_f32 v60, s20, v0 :: v_dual_add_f32 v61, s21, v1 +; GFX11-NEXT: v_dual_add_f32 v0, s12, v0 :: v_dual_add_f32 v1, s13, v1 +; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s24 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: image_sample v[60:63], v[60:61], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_sample v[64:67], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: s_waitcnt vmcnt(14) +; GFX11-NEXT: v_dual_add_f32 v0, v8, v4 :: v_dual_add_f32 v1, v9, v5 +; GFX11-NEXT: v_dual_add_f32 v4, v10, v6 :: v_dual_add_f32 v5, v11, v7 +; GFX11-NEXT: s_waitcnt vmcnt(13) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v0, v12, v0 :: v_dual_add_f32 v1, v13, v1 +; GFX11-NEXT: v_dual_add_f32 v4, v14, v4 :: v_dual_add_f32 v5, v15, v5 +; GFX11-NEXT: s_waitcnt vmcnt(12) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v0, v16, v0 :: v_dual_add_f32 v1, v17, v1 +; GFX11-NEXT: v_dual_add_f32 v4, v18, v4 :: v_dual_add_f32 v5, v19, v5 +; GFX11-NEXT: s_waitcnt vmcnt(11) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v0, v20, v0 :: v_dual_add_f32 v1, v21, v1 +; GFX11-NEXT: v_dual_add_f32 v4, v22, v4 :: v_dual_add_f32 v5, v23, v5 +; GFX11-NEXT: s_waitcnt vmcnt(10) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v0, v24, v0 :: v_dual_add_f32 v1, v25, v1 +; GFX11-NEXT: v_dual_add_f32 v4, v26, v4 :: v_dual_add_f32 v5, v27, v5 +; GFX11-NEXT: s_waitcnt vmcnt(9) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v0, v28, v0 :: v_dual_add_f32 v1, v29, v1 +; GFX11-NEXT: v_dual_add_f32 v4, v30, v4 :: v_dual_add_f32 v5, v31, v5 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v0, v32, v0 :: v_dual_add_f32 v1, v33, v1 +; GFX11-NEXT: v_dual_add_f32 v4, v34, v4 :: v_dual_add_f32 v5, v35, v5 +; GFX11-NEXT: s_waitcnt vmcnt(7) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v0, v36, v0 :: v_dual_add_f32 v1, v37, v1 +; GFX11-NEXT: v_dual_add_f32 v4, v38, v4 :: v_dual_add_f32 v5, v39, v5 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v0, v40, v0 :: v_dual_add_f32 v1, v41, v1 +; GFX11-NEXT: v_dual_add_f32 v4, v42, v4 :: v_dual_add_f32 v5, v43, v5 +; GFX11-NEXT: s_waitcnt vmcnt(5) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v0, v44, v0 :: v_dual_add_f32 v1, v45, v1 +; GFX11-NEXT: v_dual_add_f32 v4, v46, v4 :: v_dual_add_f32 v5, v47, v5 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v0, v48, v0 :: v_dual_add_f32 v1, v49, v1 +; GFX11-NEXT: v_dual_add_f32 v4, v50, v4 :: v_dual_add_f32 v5, v51, v5 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v0, v52, v0 :: v_dual_add_f32 v1, v53, v1 +; GFX11-NEXT: 
v_dual_add_f32 v4, v54, v4 :: v_dual_add_f32 v5, v55, v5 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v0, v56, v0 :: v_dual_add_f32 v1, v57, v1 +; GFX11-NEXT: v_dual_add_f32 v4, v58, v4 :: v_dual_add_f32 v5, v59, v5 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v0, v60, v0 :: v_dual_add_f32 v1, v61, v1 +; GFX11-NEXT: v_dual_add_f32 v4, v62, v4 :: v_dual_add_f32 v5, v63, v5 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v0, v64, v0 :: v_dual_add_f32 v1, v65, v1 +; GFX11-NEXT: v_dual_add_f32 v4, v66, v4 :: v_dual_add_f32 v5, v67, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e32 v1, v4, v5 +; GFX11-NEXT: exp mrt0 v0, v1, off, off done +; GFX11-NEXT: s_endpgm +.entry: + %i = call i64 @llvm.amdgcn.s.getpc() + %i1 = and i64 %i, -4294967296 + %i2 = zext i32 %userdata6 to i64 + %i3 = or disjoint i64 %i1, %i2 + %i4 = inttoptr i64 %i3 to ptr addrspace(4) + %i5 = load <4 x i32>, ptr addrspace(4) %i4, align 16 + %i6 = zext i32 %userdata7 to i64 + %i7 = or disjoint i64 %i1, %i6 + %i8 = inttoptr i64 %i7 to ptr addrspace(4) + %i9 = load <4 x i32>, ptr addrspace(4) %i8, align 4, !invariant.load !0 + %i10 = zext i32 %userdata8 to i64 + %i11 = or disjoint i64 %i1, %i10 + %i12 = inttoptr i64 %i11 to ptr addrspace(4) + %i13 = load <8 x i32>, ptr addrspace(4) %i12, align 4, !invariant.load !0 + %i14 = call float @llvm.amdgcn.lds.param.load(i32 1, i32 0, i32 %PrimMask) + %PerspInterpCenter.i1 = extractelement <2 x float> %PerspInterpCenter, i64 1 + %PerspInterpCenter.i0 = extractelement <2 x float> %PerspInterpCenter, i64 0 + %i15 = call float @llvm.amdgcn.interp.inreg.p10(float %i14, float %PerspInterpCenter.i0, float %i14) + %i16 = call float @llvm.amdgcn.interp.inreg.p2(float %i14, float %PerspInterpCenter.i1, float %i15) + %i17 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %PrimMask) + %i18 = call float @llvm.amdgcn.interp.inreg.p10(float %i17, float %PerspInterpCenter.i0, float %i17) + %i19 = call float @llvm.amdgcn.interp.inreg.p2(float %i17, float %PerspInterpCenter.i1, float %i18) + %i20 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 16, i32 0), !invariant.load !0 + %i21 = shufflevector <2 x i32> %i20, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> + %i22 = bitcast <4 x i32> %i21 to <4 x float> + %.i0 = extractelement <4 x float> %i22, i64 0 + %.i1 = extractelement <4 x float> %i22, i64 1 + %.i03 = fadd reassoc nnan nsz arcp contract afn float %.i0, %i19 + %.i14 = fadd reassoc nnan nsz arcp contract afn float %.i1, %i16 + %i23 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i03, float %.i14, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0) + %.i010 = extractelement <4 x float> %i23, i64 0 + %.i113 = extractelement <4 x float> %i23, i64 1 + %.i215 = extractelement <4 x float> %i23, i64 2 + %.i317 = extractelement <4 x float> %i23, i64 3 + %i24 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 32, i32 0), !invariant.load !0 + %i25 = shufflevector <2 x i32> %i24, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> + %i26 = bitcast <4 x i32> %i25 to <4 x float> + %.i05 =
extractelement <4 x float> %i26, i64 0 + %.i16 = extractelement <4 x float> %i26, i64 1 + %.i07 = fadd reassoc nnan nsz arcp contract afn float %.i05, %i19 + %.i18 = fadd reassoc nnan nsz arcp contract afn float %.i16, %i16 + %i27 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i07, float %.i18, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0) + %.i09 = extractelement <4 x float> %i27, i64 0 + %.i011 = fadd reassoc nnan nsz arcp contract afn float %.i09, %.i010 + %.i112 = extractelement <4 x float> %i27, i64 1 + %.i114 = fadd reassoc nnan nsz arcp contract afn float %.i112, %.i113 + %.i2 = extractelement <4 x float> %i27, i64 2 + %.i216 = fadd reassoc nnan nsz arcp contract afn float %.i2, %.i215 + %.i3 = extractelement <4 x float> %i27, i64 3 + %.i318 = fadd reassoc nnan nsz arcp contract afn float %.i3, %.i317 + %i28 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 48, i32 0), !invariant.load !0 + %i29 = shufflevector <2 x i32> %i28, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> + %i30 = bitcast <4 x i32> %i29 to <4 x float> + %.i019 = extractelement <4 x float> %i30, i64 0 + %.i120 = extractelement <4 x float> %i30, i64 1 + %.i021 = fadd reassoc nnan nsz arcp contract afn float %.i019, %i19 + %.i122 = fadd reassoc nnan nsz arcp contract afn float %.i120, %i16 + %i31 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i021, float %.i122, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0) + %.i023 = extractelement <4 x float> %i31, i64 0 + %.i024 = fadd reassoc nnan nsz arcp contract afn float %.i023, %.i011 + %.i125 = extractelement <4 x float> %i31, i64 1 + %.i126 = fadd reassoc nnan nsz arcp contract afn float %.i125, %.i114 + %.i227 = extractelement <4 x float> %i31, i64 2 + %.i228 = fadd reassoc nnan nsz arcp contract afn float %.i227, %.i216 + %.i329 = extractelement <4 x float> %i31, i64 3 + %.i330 = fadd reassoc nnan nsz arcp contract afn float %.i329, %.i318 + %i32 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 64, i32 0), !invariant.load !0 + %i33 = shufflevector <2 x i32> %i32, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> + %i34 = bitcast <4 x i32> %i33 to <4 x float> + %.i031 = extractelement <4 x float> %i34, i64 0 + %.i132 = extractelement <4 x float> %i34, i64 1 + %.i033 = fadd reassoc nnan nsz arcp contract afn float %.i031, %i19 + %.i134 = fadd reassoc nnan nsz arcp contract afn float %.i132, %i16 + %i35 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i033, float %.i134, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0) + %.i035 = extractelement <4 x float> %i35, i64 0 + %.i036 = fadd reassoc nnan nsz arcp contract afn float %.i035, %.i024 + %.i137 = extractelement <4 x float> %i35, i64 1 + %.i138 = fadd reassoc nnan nsz arcp contract afn float %.i137, %.i126 + %.i239 = extractelement <4 x float> %i35, i64 2 + %.i240 = fadd reassoc nnan nsz arcp contract afn float %.i239, %.i228 + %.i341 = extractelement <4 x float> %i35, i64 3 + %.i342 = fadd reassoc nnan nsz arcp contract afn float %.i341, %.i330 + %i36 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 80, i32 0), !invariant.load !0 + %i37 = shufflevector <2 x i32> %i36, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> + %i38 = bitcast <4 x i32> %i37 to <4 x float> + %.i043 = extractelement <4 x float> %i38, i64 0 + %.i144 = extractelement <4 x float> %i38, i64 1 + %.i045 = fadd reassoc
nnan nsz arcp contract afn float %.i043, %i19 + %.i146 = fadd reassoc nnan nsz arcp contract afn float %.i144, %i16 + %i39 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i045, float %.i146, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0) + %.i047 = extractelement <4 x float> %i39, i64 0 + %.i048 = fadd reassoc nnan nsz arcp contract afn float %.i047, %.i036 + %.i149 = extractelement <4 x float> %i39, i64 1 + %.i150 = fadd reassoc nnan nsz arcp contract afn float %.i149, %.i138 + %.i251 = extractelement <4 x float> %i39, i64 2 + %.i252 = fadd reassoc nnan nsz arcp contract afn float %.i251, %.i240 + %.i353 = extractelement <4 x float> %i39, i64 3 + %.i354 = fadd reassoc nnan nsz arcp contract afn float %.i353, %.i342 + %i40 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 96, i32 0), !invariant.load !0 + %i41 = shufflevector <2 x i32> %i40, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> + %i42 = bitcast <4 x i32> %i41 to <4 x float> + %.i055 = extractelement <4 x float> %i42, i64 0 + %.i156 = extractelement <4 x float> %i42, i64 1 + %.i057 = fadd reassoc nnan nsz arcp contract afn float %.i055, %i19 + %.i158 = fadd reassoc nnan nsz arcp contract afn float %.i156, %i16 + %i43 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i057, float %.i158, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0) + %.i059 = extractelement <4 x float> %i43, i64 0 + %.i060 = fadd reassoc nnan nsz arcp contract afn float %.i059, %.i048 + %.i161 = extractelement <4 x float> %i43, i64 1 + %.i162 = fadd reassoc nnan nsz arcp contract afn float %.i161, %.i150 + %.i263 = extractelement <4 x float> %i43, i64 2 + %.i264 = fadd reassoc nnan nsz arcp contract afn float %.i263, %.i252 + %.i365 = extractelement <4 x float> %i43, i64 3 + %.i366 = fadd reassoc nnan nsz arcp contract afn float %.i365, %.i354 + %i44 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 112, i32 0), !invariant.load !0 + %i45 = shufflevector <2 x i32> %i44, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> + %i46 = bitcast <4 x i32> %i45 to <4 x float> + %.i067 = extractelement <4 x float> %i46, i64 0 + %.i168 = extractelement <4 x float> %i46, i64 1 + %.i069 = fadd reassoc nnan nsz arcp contract afn float %.i067, %i19 + %.i170 = fadd reassoc nnan nsz arcp contract afn float %.i168, %i16 + %i47 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i069, float %.i170, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0) + %.i071 = extractelement <4 x float> %i47, i64 0 + %.i072 = fadd reassoc nnan nsz arcp contract afn float %.i071, %.i060 + %.i173 = extractelement <4 x float> %i47, i64 1 + %.i174 = fadd reassoc nnan nsz arcp contract afn float %.i173, %.i162 + %.i275 = extractelement <4 x float> %i47, i64 2 + %.i276 = fadd reassoc nnan nsz arcp contract afn float %.i275, %.i264 + %.i377 = extractelement <4 x float> %i47, i64 3 + %.i378 = fadd reassoc nnan nsz arcp contract afn float %.i377, %.i366 + %i48 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 128, i32 0), !invariant.load !0 + %i49 = shufflevector <2 x i32> %i48, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> + %i50 = bitcast <4 x i32> %i49 to <4 x float> + %.i079 = extractelement <4 x float> %i50, i64 0 + %.i180 = extractelement <4 x float> %i50, i64 1 + %.i081 = fadd reassoc nnan nsz arcp contract afn float %.i079, %i19 + %.i182 = fadd reassoc nnan nsz arcp contract afn
float %.i180, %i16 + %i51 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i081, float %.i182, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0) + %.i083 = extractelement <4 x float> %i51, i64 0 + %.i084 = fadd reassoc nnan nsz arcp contract afn float %.i083, %.i072 + %.i185 = extractelement <4 x float> %i51, i64 1 + %.i186 = fadd reassoc nnan nsz arcp contract afn float %.i185, %.i174 + %.i287 = extractelement <4 x float> %i51, i64 2 + %.i288 = fadd reassoc nnan nsz arcp contract afn float %.i287, %.i276 + %.i389 = extractelement <4 x float> %i51, i64 3 + %.i390 = fadd reassoc nnan nsz arcp contract afn float %.i389, %.i378 + %i52 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 144, i32 0), !invariant.load !0 + %i53 = shufflevector <2 x i32> %i52, <2 x i32> poison, <4 x i32> + %i54 = bitcast <4 x i32> %i53 to <4 x float> + %.i091 = extractelement <4 x float> %i54, i64 0 + %.i192 = extractelement <4 x float> %i54, i64 1 + %.i093 = fadd reassoc nnan nsz arcp contract afn float %.i091, %i19 + %.i194 = fadd reassoc nnan nsz arcp contract afn float %.i192, %i16 + %i55 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i093, float %.i194, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0) + %.i095 = extractelement <4 x float> %i55, i64 0 + %.i096 = fadd reassoc nnan nsz arcp contract afn float %.i095, %.i084 + %.i197 = extractelement <4 x float> %i55, i64 1 + %.i198 = fadd reassoc nnan nsz arcp contract afn float %.i197, %.i186 + %.i299 = extractelement <4 x float> %i55, i64 2 + %.i2100 = fadd reassoc nnan nsz arcp contract afn float %.i299, %.i288 + %.i3101 = extractelement <4 x float> %i55, i64 3 + %.i3102 = fadd reassoc nnan nsz arcp contract afn float %.i3101, %.i390 + %i56 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 160, i32 0), !invariant.load !0 + %i57 = shufflevector <2 x i32> %i56, <2 x i32> poison, <4 x i32> + %i58 = bitcast <4 x i32> %i57 to <4 x float> + %.i0103 = extractelement <4 x float> %i58, i64 0 + %.i1104 = extractelement <4 x float> %i58, i64 1 + %.i0105 = fadd reassoc nnan nsz arcp contract afn float %.i0103, %i19 + %.i1106 = fadd reassoc nnan nsz arcp contract afn float %.i1104, %i16 + %i59 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i0105, float %.i1106, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0) + %.i0107 = extractelement <4 x float> %i59, i64 0 + %.i0108 = fadd reassoc nnan nsz arcp contract afn float %.i0107, %.i096 + %.i1109 = extractelement <4 x float> %i59, i64 1 + %.i1110 = fadd reassoc nnan nsz arcp contract afn float %.i1109, %.i198 + %.i2111 = extractelement <4 x float> %i59, i64 2 + %.i2112 = fadd reassoc nnan nsz arcp contract afn float %.i2111, %.i2100 + %.i3113 = extractelement <4 x float> %i59, i64 3 + %.i3114 = fadd reassoc nnan nsz arcp contract afn float %.i3113, %.i3102 + %i60 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 176, i32 0), !invariant.load !0 + %i61 = shufflevector <2 x i32> %i60, <2 x i32> poison, <4 x i32> + %i62 = bitcast <4 x i32> %i61 to <4 x float> + %.i0115 = extractelement <4 x float> %i62, i64 0 + %.i1116 = extractelement <4 x float> %i62, i64 1 + %.i0117 = fadd reassoc nnan nsz arcp contract afn float %.i0115, %i19 + %.i1118 = fadd reassoc nnan nsz arcp contract afn float %.i1116, %i16 + %i63 = call reassoc nnan nsz arcp contract 
afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i0117, float %.i1118, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0) + %.i0119 = extractelement <4 x float> %i63, i64 0 + %.i0120 = fadd reassoc nnan nsz arcp contract afn float %.i0119, %.i0108 + %.i1121 = extractelement <4 x float> %i63, i64 1 + %.i1122 = fadd reassoc nnan nsz arcp contract afn float %.i1121, %.i1110 + %.i2123 = extractelement <4 x float> %i63, i64 2 + %.i2124 = fadd reassoc nnan nsz arcp contract afn float %.i2123, %.i2112 + %.i3125 = extractelement <4 x float> %i63, i64 3 + %.i3126 = fadd reassoc nnan nsz arcp contract afn float %.i3125, %.i3114 + %i64 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 192, i32 0), !invariant.load !0 + %i65 = shufflevector <2 x i32> %i64, <2 x i32> poison, <4 x i32> + %i66 = bitcast <4 x i32> %i65 to <4 x float> + %.i0127 = extractelement <4 x float> %i66, i64 0 + %.i1128 = extractelement <4 x float> %i66, i64 1 + %.i0129 = fadd reassoc nnan nsz arcp contract afn float %.i0127, %i19 + %.i1130 = fadd reassoc nnan nsz arcp contract afn float %.i1128, %i16 + %i67 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i0129, float %.i1130, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0) + %.i0131 = extractelement <4 x float> %i67, i64 0 + %.i0132 = fadd reassoc nnan nsz arcp contract afn float %.i0131, %.i0120 + %.i1133 = extractelement <4 x float> %i67, i64 1 + %.i1134 = fadd reassoc nnan nsz arcp contract afn float %.i1133, %.i1122 + %.i2135 = extractelement <4 x float> %i67, i64 2 + %.i2136 = fadd reassoc nnan nsz arcp contract afn float %.i2135, %.i2124 + %.i3137 = extractelement <4 x float> %i67, i64 3 + %.i3138 = fadd reassoc nnan nsz arcp contract afn float %.i3137, %.i3126 + %i68 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 208, i32 0), !invariant.load !0 + %i69 = shufflevector <2 x i32> %i68, <2 x i32> poison, <4 x i32> + %i70 = bitcast <4 x i32> %i69 to <4 x float> + %.i0139 = extractelement <4 x float> %i70, i64 0 + %.i1140 = extractelement <4 x float> %i70, i64 1 + %.i0141 = fadd reassoc nnan nsz arcp contract afn float %.i0139, %i19 + %.i1142 = fadd reassoc nnan nsz arcp contract afn float %.i1140, %i16 + %i71 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i0141, float %.i1142, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0) + %.i0143 = extractelement <4 x float> %i71, i64 0 + %.i0144 = fadd reassoc nnan nsz arcp contract afn float %.i0143, %.i0132 + %.i1145 = extractelement <4 x float> %i71, i64 1 + %.i1146 = fadd reassoc nnan nsz arcp contract afn float %.i1145, %.i1134 + %.i2147 = extractelement <4 x float> %i71, i64 2 + %.i2148 = fadd reassoc nnan nsz arcp contract afn float %.i2147, %.i2136 + %.i3149 = extractelement <4 x float> %i71, i64 3 + %.i3150 = fadd reassoc nnan nsz arcp contract afn float %.i3149, %.i3138 + %i72 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 224, i32 0), !invariant.load !0 + %i73 = shufflevector <2 x i32> %i72, <2 x i32> poison, <4 x i32> + %i74 = bitcast <4 x i32> %i73 to <4 x float> + %.i0151 = extractelement <4 x float> %i74, i64 0 + %.i1152 = extractelement <4 x float> %i74, i64 1 + %.i0153 = fadd reassoc nnan nsz arcp contract afn float %.i0151, %i19 + %.i1154 = fadd reassoc nnan nsz arcp contract afn float %.i1152, %i16 + %i75 = call reassoc nnan nsz arcp contract afn <4 x float> 
@llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i0153, float %.i1154, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0) + %.i0155 = extractelement <4 x float> %i75, i64 0 + %.i0156 = fadd reassoc nnan nsz arcp contract afn float %.i0155, %.i0144 + %.i1157 = extractelement <4 x float> %i75, i64 1 + %.i1158 = fadd reassoc nnan nsz arcp contract afn float %.i1157, %.i1146 + %.i2159 = extractelement <4 x float> %i75, i64 2 + %.i2160 = fadd reassoc nnan nsz arcp contract afn float %.i2159, %.i2148 + %.i3161 = extractelement <4 x float> %i75, i64 3 + %.i3162 = fadd reassoc nnan nsz arcp contract afn float %.i3161, %.i3150 + %i76 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 240, i32 0), !invariant.load !0 + %i77 = shufflevector <2 x i32> %i76, <2 x i32> poison, <4 x i32> + %i78 = bitcast <4 x i32> %i77 to <4 x float> + %.i0163 = extractelement <4 x float> %i78, i64 0 + %.i1164 = extractelement <4 x float> %i78, i64 1 + %.i0165 = fadd reassoc nnan nsz arcp contract afn float %.i0163, %i19 + %.i1166 = fadd reassoc nnan nsz arcp contract afn float %.i1164, %i16 + %i79 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i0165, float %.i1166, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0) + %.i0167 = extractelement <4 x float> %i79, i64 0 + %.i0168 = fadd reassoc nnan nsz arcp contract afn float %.i0167, %.i0156 + %.i1169 = extractelement <4 x float> %i79, i64 1 + %.i1170 = fadd reassoc nnan nsz arcp contract afn float %.i1169, %.i1158 + %.i2171 = extractelement <4 x float> %i79, i64 2 + %.i2172 = fadd reassoc nnan nsz arcp contract afn float %.i2171, %.i2160 + %.i3173 = extractelement <4 x float> %i79, i64 3 + %.i3174 = fadd reassoc nnan nsz arcp contract afn float %.i3173, %.i3162 + %i80 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 256, i32 0), !invariant.load !0 + %i81 = shufflevector <2 x i32> %i80, <2 x i32> poison, <4 x i32> + %i82 = bitcast <4 x i32> %i81 to <4 x float> + %.i0175 = extractelement <4 x float> %i82, i64 0 + %.i1176 = extractelement <4 x float> %i82, i64 1 + %.i0177 = fadd reassoc nnan nsz arcp contract afn float %.i0175, %i19 + %.i1178 = fadd reassoc nnan nsz arcp contract afn float %.i1176, %i16 + %i83 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i0177, float %.i1178, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0) + %.i0179 = extractelement <4 x float> %i83, i64 0 + %.i0180 = fadd reassoc nnan nsz arcp contract afn float %.i0179, %.i0168 + %.i1181 = extractelement <4 x float> %i83, i64 1 + %.i1182 = fadd reassoc nnan nsz arcp contract afn float %.i1181, %.i1170 + %.i2183 = extractelement <4 x float> %i83, i64 2 + %.i2184 = fadd reassoc nnan nsz arcp contract afn float %.i2183, %.i2172 + %.i3185 = extractelement <4 x float> %i83, i64 3 + %.i3186 = fadd reassoc nnan nsz arcp contract afn float %.i3185, %.i3174 + %i84 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %.i0180, float %.i1182) + %i85 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %.i2184, float %.i3186) + %i86 = bitcast <2 x half> %i84 to float + %i87 = bitcast <2 x half> %i85 to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 3, float %i86, float %i87, float poison, float poison, i1 true, i1 true) + ret void +} + +declare noundef i64 @llvm.amdgcn.s.getpc() #3 +declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, 
i32 immarg) #5 +declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #3 +declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) #4 +declare float @llvm.amdgcn.lds.param.load(i32 immarg, i32 immarg, i32) #3 +declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #3 +declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #3 +declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32 immarg) #8 + +attributes #2 = { alwaysinline nounwind memory(readwrite) "amdgpu-sched-strategy"="max-memory-clause"} +attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } +attributes #5 = { nocallback nofree nosync nounwind willreturn memory(read) } +attributes #8 = { nocallback nofree nosync nounwind willreturn memory(none) } + +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/schedule-ilp-liveness-tracking.mir b/llvm/test/CodeGen/AMDGPU/schedule-ilp-liveness-tracking.mir index 4b6e204ecf9570..c2cd4653bc9bf3 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-ilp-liveness-tracking.mir +++ b/llvm/test/CodeGen/AMDGPU/schedule-ilp-liveness-tracking.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-enable-max-ilp-scheduling-strategy -verify-machineinstrs -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-sched-strategy=max-ilp -verify-machineinstrs -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s --- name: max-ilp-liveness-tracking diff --git a/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll b/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll index 11602b1d353f91..350ff94373a725 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll @@ -1,6 +1,6 @@ ; RUN: llc -mtriple=amdgcn -mcpu=tonga -misched=gcn-iterative-ilp -verify-machineinstrs < %s | FileCheck %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -misched=gcn-max-ilp -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-enable-max-ilp-scheduling-strategy -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-sched-strategy=max-ilp -verify-machineinstrs < %s | FileCheck %s ; CHECK: NumVgprs: {{[0-9][0-9][0-9]$}} diff --git a/llvm/test/Transforms/InstCombine/trunc.ll b/llvm/test/Transforms/InstCombine/trunc.ll index f29ecb8c313738..a85ce716fbdfab 100644 --- a/llvm/test/Transforms/InstCombine/trunc.ll +++ b/llvm/test/Transforms/InstCombine/trunc.ll @@ -1129,8 +1129,7 @@ define i1 @trunc_nuw_i1_non_zero(i8 %1) { ; CHECK-LABEL: @trunc_nuw_i1_non_zero( ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i8 [[TMP0:%.*]], 0 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP2]]) -; CHECK-NEXT: [[RET:%.*]] = trunc nuw i8 [[TMP0]] to i1 -; CHECK-NEXT: ret i1 [[RET]] +; CHECK-NEXT: ret i1 true ; %3 = icmp ne i8 %1, 0 tail call void @llvm.assume(i1 %3) @@ -1172,3 +1171,37 @@ define i1 @neg_trunc_i1_non_zero(i8 %1) { %ret = trunc i8 %1 to i1 ret i1 %ret } + +define i1 @trunc_nsw_i1_non_zero(i8 %1) { +; CHECK-LABEL: @trunc_nsw_i1_non_zero( +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i8 [[TMP0:%.*]], 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP2]]) +; CHECK-NEXT: ret i1 true +; + %3 = icmp ne i8 %1, 0 + tail call void @llvm.assume(i1 %3) + %ret = trunc nsw i8 %1 to i1 
+ ret i1 %ret +} + +define i1 @neg_trunc_nsw_i1_maybe_zero(i8 %1) { +; CHECK-LABEL: @neg_trunc_nsw_i1_maybe_zero( +; CHECK-NEXT: [[RET:%.*]] = trunc nsw i8 [[TMP0:%.*]] to i1 +; CHECK-NEXT: ret i1 [[RET]] +; + %ret = trunc nsw i8 %1 to i1 + ret i1 %ret +} + +define i2 @neg_trunc_nsw_i2_non_zero(i8 %1) { +; CHECK-LABEL: @neg_trunc_nsw_i2_non_zero( +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i8 [[TMP0:%.*]], 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP2]]) +; CHECK-NEXT: [[RET:%.*]] = trunc nsw i8 [[TMP0]] to i2 +; CHECK-NEXT: ret i2 [[RET]] +; + %3 = icmp ne i8 %1, 0 + tail call void @llvm.assume(i1 %3) + %ret = trunc nsw i8 %1 to i2 + ret i2 %ret +} diff --git a/llvm/unittests/TargetParser/Host.cpp b/llvm/unittests/TargetParser/Host.cpp index 5e2edcef09bf8c..67c727a6c1c5df 100644 --- a/llvm/unittests/TargetParser/Host.cpp +++ b/llvm/unittests/TargetParser/Host.cpp @@ -140,6 +140,9 @@ TEST(getLinuxHostCPUName, AArch64) { EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0x51\n" "CPU part : 0x001"), "oryon-1"); + EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0x46\n" + "CPU part : 0x003"), + "fujitsu-monaka"); // MSM8992/4 weirdness StringRef MSM8992ProcCpuInfo = R"( diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp index b74255c14dd0bd..1f69190e4bec53 100644 --- a/llvm/unittests/TargetParser/TargetParserTest.cpp +++ b/llvm/unittests/TargetParser/TargetParserTest.cpp @@ -1158,13 +1158,14 @@ INSTANTIATE_TEST_SUITE_P( AArch64CPUTestParams("thunderxt88", "armv8-a"), AArch64CPUTestParams("tsv110", "armv8.2-a"), AArch64CPUTestParams("a64fx", "armv8.2-a"), + AArch64CPUTestParams("fujitsu-monaka", "armv9.3-a"), AArch64CPUTestParams("carmel", "armv8.2-a"), AArch64CPUTestParams("saphira", "armv8.4-a"), AArch64CPUTestParams("oryon-1", "armv8.6-a")), AArch64CPUTestParams::PrintToStringParamName); // Note: number of CPUs includes aliases. -static constexpr unsigned NumAArch64CPUArchs = 81; +static constexpr unsigned NumAArch64CPUArchs = 82; TEST(TargetParserTest, testAArch64CPUArchList) { SmallVector List;