diff --git a/clang/docs/ClangFormat.rst b/clang/docs/ClangFormat.rst index 7afad5b15b2d5..e17d741b0a00e 100644 --- a/clang/docs/ClangFormat.rst +++ b/clang/docs/ClangFormat.rst @@ -33,7 +33,7 @@ to format C/C++/Java/JavaScript/JSON/Objective-C/Protobuf/C# code. Clang-format options: --Werror - If set, changes formatting warnings to errors - --Wno-error= - If set don't error out on the specified warning type. + --Wno-error= - If set, don't error out on the specified warning type. =unknown - If set, unknown format options are only warned about. This can be used to enable formatting, even if the configuration contains unknown (newer) options. diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst index 3f996ceaff156..481362dba3f51 100644 --- a/clang/docs/OpenMPSupport.rst +++ b/clang/docs/OpenMPSupport.rst @@ -290,7 +290,7 @@ implementation. +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | memory management | changes to omp_alloctrait_key enum | :none:`unclaimed` | | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| memory model | seq_cst clause on flush construct | :none:`unclaimed` | | +| memory model | seq_cst clause on flush construct | :good:`done` | https://github.com/llvm/llvm-project/pull/114072 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | misc | 'omp_all_memory' keyword and use in 'depend' clause | :good:`done` | D125828, D126321 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 88a9f247c9a5b..5881684a07dad 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -498,6 +498,8 @@ Attribute Changes in Clang - Clang now supports ``[[clang::lifetime_capture_by(X)]]``. Similar to lifetimebound, this can be used to specify when a reference to a function parameter is captured by another capturing entity ``X``. +- The ``target_version`` attribute is now only supported for AArch64 and RISC-V architectures. + Improvements to Clang's diagnostics ----------------------------------- @@ -746,6 +748,7 @@ Bug Fixes to C++ Support assumption if they also occur inside of a dependent lambda. (#GH114787) - Clang now uses valid deduced type locations when diagnosing functions with trailing return type missing placeholder return type. (#GH78694) +- Fixed a bug where bounds of partially expanded pack indexing expressions were checked too early. 
(#GH116105) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h index 696a574833dad..1a24b8857674c 100644 --- a/clang/include/clang/AST/ExprCXX.h +++ b/clang/include/clang/AST/ExprCXX.h @@ -4390,17 +4390,17 @@ class PackIndexingExpr final unsigned TransformedExpressions : 31; LLVM_PREFERRED_TYPE(bool) - unsigned ExpandedToEmptyPack : 1; + unsigned FullySubstituted : 1; PackIndexingExpr(QualType Type, SourceLocation EllipsisLoc, SourceLocation RSquareLoc, Expr *PackIdExpr, Expr *IndexExpr, ArrayRef SubstitutedExprs = {}, - bool ExpandedToEmptyPack = false) + bool FullySubstituted = false) : Expr(PackIndexingExprClass, Type, VK_LValue, OK_Ordinary), EllipsisLoc(EllipsisLoc), RSquareLoc(RSquareLoc), SubExprs{PackIdExpr, IndexExpr}, TransformedExpressions(SubstitutedExprs.size()), - ExpandedToEmptyPack(ExpandedToEmptyPack) { + FullySubstituted(FullySubstituted) { auto *Exprs = getTrailingObjects(); std::uninitialized_copy(SubstitutedExprs.begin(), SubstitutedExprs.end(), @@ -4424,12 +4424,16 @@ class PackIndexingExpr final SourceLocation RSquareLoc, Expr *PackIdExpr, Expr *IndexExpr, std::optional Index, ArrayRef SubstitutedExprs = {}, - bool ExpandedToEmptyPack = false); + bool FullySubstituted = false); static PackIndexingExpr *CreateDeserialized(ASTContext &Context, unsigned NumTransformedExprs); + bool isFullySubstituted() const { return FullySubstituted; } + /// Determine if the expression was expanded to empty. - bool expandsToEmptyPack() const { return ExpandedToEmptyPack; } + bool expandsToEmptyPack() const { + return isFullySubstituted() && TransformedExpressions == 0; + } /// Determine the location of the 'sizeof' keyword. SourceLocation getEllipsisLoc() const { return EllipsisLoc; } diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h index 00c87e71bde31..d2f5267e4da5e 100644 --- a/clang/include/clang/AST/OpenMPClause.h +++ b/clang/include/clang/AST/OpenMPClause.h @@ -2670,8 +2670,8 @@ class OMPCompareClause final : public OMPClause { } }; -/// This represents 'seq_cst' clause in the '#pragma omp atomic' -/// directive. +/// This represents 'seq_cst' clause in the '#pragma omp atomic|flush' +/// directives. /// /// \code /// #pragma omp atomic seq_cst diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 1ed5c22361ca6..90a52b1dcbf62 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -5922,12 +5922,12 @@ class PackIndexingType final unsigned Size : 31; LLVM_PREFERRED_TYPE(bool) - unsigned ExpandsToEmptyPack : 1; + unsigned FullySubstituted : 1; protected: friend class ASTContext; // ASTContext creates these. 
PackIndexingType(const ASTContext &Context, QualType Canonical, - QualType Pattern, Expr *IndexExpr, bool ExpandsToEmptyPack, + QualType Pattern, Expr *IndexExpr, bool FullySubstituted, ArrayRef Expansions = {}); public: @@ -5951,7 +5951,9 @@ class PackIndexingType final bool hasSelectedType() const { return getSelectedIndex() != std::nullopt; } - bool expandsToEmptyPack() const { return ExpandsToEmptyPack; } + bool isFullySubstituted() const { return FullySubstituted; } + + bool expandsToEmptyPack() const { return isFullySubstituted() && Size == 0; } ArrayRef getExpansions() const { return {getExpansionsPtr(), Size}; @@ -5965,10 +5967,10 @@ class PackIndexingType final if (hasSelectedType()) getSelectedType().Profile(ID); else - Profile(ID, Context, getPattern(), getIndexExpr(), expandsToEmptyPack()); + Profile(ID, Context, getPattern(), getIndexExpr(), isFullySubstituted()); } static void Profile(llvm::FoldingSetNodeID &ID, const ASTContext &Context, - QualType Pattern, Expr *E, bool ExpandsToEmptyPack); + QualType Pattern, Expr *E, bool FullySubstituted); private: const QualType *getExpansionsPtr() const { diff --git a/clang/include/clang/AST/TypeProperties.td b/clang/include/clang/AST/TypeProperties.td index a8b9c920b617c..6f1a76bd18fb5 100644 --- a/clang/include/clang/AST/TypeProperties.td +++ b/clang/include/clang/AST/TypeProperties.td @@ -473,12 +473,12 @@ let Class = PackIndexingType in { def : Property<"indexExpression", ExprRef> { let Read = [{ node->getIndexExpr() }]; } - def : Property<"expandsToEmptyPack", Bool> { - let Read = [{ node->expandsToEmptyPack() }]; + def : Property<"isFullySubstituted", Bool> { + let Read = [{ node->isFullySubstituted() }]; } def : Creator<[{ - return ctx.getPackIndexingType(pattern, indexExpression, expandsToEmptyPack); + return ctx.getPackIndexingType(pattern, indexExpression, isFullySubstituted); }]>; } diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 634253d003256..14009826f2c55 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -3297,7 +3297,7 @@ def Target : InheritableAttr { }]; } -def TargetVersion : InheritableAttr { +def TargetVersion : InheritableAttr, TargetSpecificAttr> { let Spellings = [GCC<"target_version">]; let Args = [StringArgument<"NamesStr">]; let Subjects = SubjectList<[Function], ErrorDiag>; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 0b0759c6b545c..910bd6a344719 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -11399,7 +11399,7 @@ def err_omp_atomic_weak_no_equality : Error<"expected '==' operator for 'weak' c def err_omp_atomic_several_clauses : Error< "directive '#pragma omp atomic' cannot contain more than one 'read', 'write', 'update', 'capture', or 'compare' clause">; def err_omp_several_mem_order_clauses : Error< - "directive '#pragma omp %0' cannot contain more than one %select{'seq_cst', 'relaxed', |}1'acq_rel', 'acquire' or 'release' clause">; + "directive '#pragma omp %0' cannot contain more than one 'seq_cst',%select{ 'relaxed',|}1 'acq_rel', 'acquire' or 'release' clause">; def err_omp_atomic_incompatible_mem_order_clause : Error< "directive '#pragma omp atomic%select{ %0|}1' cannot be used with '%2' clause">; def note_omp_previous_mem_order_clause : Note< diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index ea6b414618c1d..056fad2cc0ff8 100644 --- 
a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -4970,12 +4970,11 @@ struct FormatStyle { /// \version 12 std::vector<std::string> StatementAttributeLikeMacros; - /// A vector of macros that should be interpreted as complete - /// statements. + /// A vector of macros that should be interpreted as complete statements. /// - /// Typical macros are expressions, and require a semi-colon to be - /// added; sometimes this is not the case, and this allows to make - /// clang-format aware of such cases. + /// Typical macros are expressions and require a semicolon to be added. + /// Sometimes this is not the case, and this allows making clang-format + /// aware of such cases. /// /// For example: Q_UNUSED /// \version 8 diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 5fe23e0d0efd3..24abd5d95dd84 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -14257,7 +14257,7 @@ class Sema final : public SemaBase { SourceLocation EllipsisLoc, Expr *IndexExpr, SourceLocation RSquareLoc, ArrayRef<Expr *> ExpandedExprs = {}, - bool EmptyPack = false); + bool FullySubstituted = false); /// Handle a C++1z fold-expression: ( expr op ... op expr ). ExprResult ActOnCXXFoldExpr(Scope *S, SourceLocation LParenLoc, Expr *LHS, diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 3e6f0d628ca92..80e8c5b9df58e 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -6223,13 +6223,11 @@ QualType ASTContext::getPackIndexingType(QualType Pattern, Expr *IndexExpr, ArrayRef<QualType> Expansions, int Index) const { QualType Canonical; - bool ExpandsToEmptyPack = FullySubstituted && Expansions.empty(); if (FullySubstituted && Index != -1) { Canonical = getCanonicalType(Expansions[Index]); } else { llvm::FoldingSetNodeID ID; - PackIndexingType::Profile(ID, *this, Pattern, IndexExpr, - ExpandsToEmptyPack); + PackIndexingType::Profile(ID, *this, Pattern, IndexExpr, FullySubstituted); void *InsertPos = nullptr; PackIndexingType *Canon = DependentPackIndexingTypes.FindNodeOrInsertPos(ID, InsertPos); @@ -6238,7 +6236,7 @@ QualType ASTContext::getPackIndexingType(QualType Pattern, Expr *IndexExpr, PackIndexingType::totalSizeToAlloc<QualType>(Expansions.size()), TypeAlignment); Canon = new (Mem) PackIndexingType(*this, QualType(), Pattern, IndexExpr, - ExpandsToEmptyPack, Expansions); + FullySubstituted, Expansions); DependentPackIndexingTypes.InsertNode(Canon, InsertPos); } Canonical = QualType(Canon, 0); @@ -6248,7 +6246,7 @@ QualType ASTContext::getPackIndexingType(QualType Pattern, Expr *IndexExpr, Allocate(PackIndexingType::totalSizeToAlloc<QualType>(Expansions.size()), TypeAlignment); auto *T = new (Mem) PackIndexingType(*this, Canonical, Pattern, IndexExpr, - ExpandsToEmptyPack, Expansions); + FullySubstituted, Expansions); Types.push_back(T); return QualType(T, 0); } diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 3d8215ffc8c22..6add18ef4e1af 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -1642,22 +1642,8 @@ bool Compiler<Emitter>::VisitImplicitValueInitExpr( if (QT->isIncompleteArrayType()) return true; - if (QT->isArrayType()) { - const ArrayType *AT = QT->getAsArrayTypeUnsafe(); - assert(AT); - const auto *CAT = cast<ConstantArrayType>(AT); - size_t NumElems = CAT->getZExtSize(); - PrimType ElemT = classifyPrim(CAT->getElementType()); - - for (size_t I = 0; I != NumElems; ++I) { - if (!this->visitZeroInitializer(ElemT, CAT->getElementType(), E)) - return false;
- if (!this->emitInitElem(ElemT, I, E)) - return false; - } - - return true; - } + if (QT->isArrayType()) + return this->visitZeroArrayInitializer(QT, E); if (const auto *ComplexTy = E->getType()->getAs()) { assert(Initializing); @@ -3916,18 +3902,9 @@ bool Compiler::visitZeroRecordInitializer(const Record *R, return false; } } else if (D->isCompositeArray()) { - const Record *ElemRecord = D->ElemDesc->ElemRecord; - assert(D->ElemDesc->ElemRecord); - for (uint32_t I = 0, N = D->getNumElems(); I != N; ++I) { - if (!this->emitConstUint32(I, E)) - return false; - if (!this->emitArrayElemPtr(PT_Uint32, E)) - return false; - if (!this->visitZeroRecordInitializer(ElemRecord, E)) - return false; - if (!this->emitPopPtr(E)) - return false; - } + // Can't be a vector or complex field. + if (!this->visitZeroArrayInitializer(D->getType(), E)) + return false; } else if (D->isRecord()) { if (!this->visitZeroRecordInitializer(D->ElemRecord, E)) return false; @@ -3958,6 +3935,52 @@ bool Compiler::visitZeroRecordInitializer(const Record *R, return true; } +template +bool Compiler::visitZeroArrayInitializer(QualType T, const Expr *E) { + assert(T->isArrayType() || T->isAnyComplexType() || T->isVectorType()); + const ArrayType *AT = T->getAsArrayTypeUnsafe(); + QualType ElemType = AT->getElementType(); + size_t NumElems = cast(AT)->getZExtSize(); + + if (std::optional ElemT = classify(ElemType)) { + for (size_t I = 0; I != NumElems; ++I) { + if (!this->visitZeroInitializer(*ElemT, ElemType, E)) + return false; + if (!this->emitInitElem(*ElemT, I, E)) + return false; + } + return true; + } else if (ElemType->isRecordType()) { + const Record *R = getRecord(ElemType); + + for (size_t I = 0; I != NumElems; ++I) { + if (!this->emitConstUint32(I, E)) + return false; + if (!this->emitArrayElemPtr(PT_Uint32, E)) + return false; + if (!this->visitZeroRecordInitializer(R, E)) + return false; + if (!this->emitPopPtr(E)) + return false; + } + return true; + } else if (ElemType->isArrayType()) { + for (size_t I = 0; I != NumElems; ++I) { + if (!this->emitConstUint32(I, E)) + return false; + if (!this->emitArrayElemPtr(PT_Uint32, E)) + return false; + if (!this->visitZeroArrayInitializer(ElemType, E)) + return false; + if (!this->emitPopPtr(E)) + return false; + } + return true; + } + + return false; +} + template template bool Compiler::emitConst(T Value, PrimType Ty, const Expr *E) { diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h index d1b624daba6b9..2a94f5ec76b6c 100644 --- a/clang/lib/AST/ByteCode/Compiler.h +++ b/clang/lib/AST/ByteCode/Compiler.h @@ -325,6 +325,7 @@ class Compiler : public ConstStmtVisitor, bool>, /// Emits a zero initializer. bool visitZeroInitializer(PrimType T, QualType QT, const Expr *E); bool visitZeroRecordInitializer(const Record *R, const Expr *E); + bool visitZeroArrayInitializer(QualType T, const Expr *E); /// Emits an APSInt constant. bool emitConst(const llvm::APSInt &Value, PrimType Ty, const Expr *E); diff --git a/clang/lib/AST/ByteCode/InterpFrame.cpp b/clang/lib/AST/ByteCode/InterpFrame.cpp index 7f02464a1c0f1..20f67d9b1fd42 100644 --- a/clang/lib/AST/ByteCode/InterpFrame.cpp +++ b/clang/lib/AST/ByteCode/InterpFrame.cpp @@ -234,7 +234,12 @@ SourceInfo InterpFrame::getSource(CodePtr PC) const { if (Func && !funcHasUsableBody(Func) && Caller) return Caller->getSource(RetPC); - return S.getSource(Func, PC); + // Similarly, if the resulting source location is invalid anyway, + // point to the caller instead. 
+ SourceInfo Result = S.getSource(Func, PC); + if (Result.getLoc().isInvalid() && Caller) + return Caller->getSource(RetPC); + return Result; } const Expr *InterpFrame::getExpr(CodePtr PC) const { diff --git a/clang/lib/AST/ComputeDependence.cpp b/clang/lib/AST/ComputeDependence.cpp index e37ebec085195..07c4419e3cf40 100644 --- a/clang/lib/AST/ComputeDependence.cpp +++ b/clang/lib/AST/ComputeDependence.cpp @@ -388,9 +388,8 @@ ExprDependence clang::computeDependence(PackIndexingExpr *E) { ExprDependence::Instantiation; ArrayRef Exprs = E->getExpressions(); - if (Exprs.empty()) + if (Exprs.empty() || !E->isFullySubstituted()) D |= PatternDep | ExprDependence::Instantiation; - else if (!E->getIndexExpr()->isInstantiationDependent()) { std::optional Index = E->getSelectedIndex(); assert(Index && *Index < Exprs.size() && "pack index out of bound"); diff --git a/clang/lib/AST/ExprCXX.cpp b/clang/lib/AST/ExprCXX.cpp index 678c9245ab46e..fc09d24fc30cb 100644 --- a/clang/lib/AST/ExprCXX.cpp +++ b/clang/lib/AST/ExprCXX.cpp @@ -1717,9 +1717,9 @@ NonTypeTemplateParmDecl *SubstNonTypeTemplateParmExpr::getParameter() const { PackIndexingExpr *PackIndexingExpr::Create( ASTContext &Context, SourceLocation EllipsisLoc, SourceLocation RSquareLoc, Expr *PackIdExpr, Expr *IndexExpr, std::optional Index, - ArrayRef SubstitutedExprs, bool ExpandedToEmptyPack) { + ArrayRef SubstitutedExprs, bool FullySubstituted) { QualType Type; - if (Index && !SubstitutedExprs.empty()) + if (Index && FullySubstituted && !SubstitutedExprs.empty()) Type = SubstitutedExprs[*Index]->getType(); else Type = Context.DependentTy; @@ -1728,7 +1728,7 @@ PackIndexingExpr *PackIndexingExpr::Create( Context.Allocate(totalSizeToAlloc(SubstitutedExprs.size())); return new (Storage) PackIndexingExpr(Type, EllipsisLoc, RSquareLoc, PackIdExpr, IndexExpr, - SubstitutedExprs, ExpandedToEmptyPack); + SubstitutedExprs, FullySubstituted); } NamedDecl *PackIndexingExpr::getPackDecl() const { diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index b70f86ef31442..edf20944f0b3e 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -4031,12 +4031,12 @@ void DependentDecltypeType::Profile(llvm::FoldingSetNodeID &ID, PackIndexingType::PackIndexingType(const ASTContext &Context, QualType Canonical, QualType Pattern, - Expr *IndexExpr, bool ExpandsToEmptyPack, + Expr *IndexExpr, bool FullySubstituted, ArrayRef Expansions) : Type(PackIndexing, Canonical, computeDependence(Pattern, IndexExpr, Expansions)), Context(Context), Pattern(Pattern), IndexExpr(IndexExpr), - Size(Expansions.size()), ExpandsToEmptyPack(ExpandsToEmptyPack) { + Size(Expansions.size()), FullySubstituted(FullySubstituted) { std::uninitialized_copy(Expansions.begin(), Expansions.end(), getTrailingObjects()); @@ -4081,10 +4081,10 @@ PackIndexingType::computeDependence(QualType Pattern, Expr *IndexExpr, void PackIndexingType::Profile(llvm::FoldingSetNodeID &ID, const ASTContext &Context, QualType Pattern, - Expr *E, bool ExpandsToEmptyPack) { + Expr *E, bool FullySubstituted) { Pattern.Profile(ID); E->Profile(ID, Context, true); - ID.AddBoolean(ExpandsToEmptyPack); + ID.AddBoolean(FullySubstituted); } UnaryTransformType::UnaryTransformType(QualType BaseType, diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index fd53969e4b3b3..aed86c1fb9955 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -693,17 +693,14 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState 
&State, bool DryRun, bool DisallowLineBreaksOnThisLine = Style.LambdaBodyIndentation == FormatStyle::LBI_Signature && - Style.isCpp() && [&Current] { - // Deal with lambda arguments in C++. The aim here is to ensure that we - // don't over-indent lambda function bodies when lambdas are passed as - // arguments to function calls. We do this by ensuring that either all - // arguments (including any lambdas) go on the same line as the function - // call, or we break before the first argument. - const auto *Prev = Current.Previous; - if (!Prev) - return false; + // Deal with lambda arguments in C++. The aim here is to ensure that we + // don't over-indent lambda function bodies when lambdas are passed as + // arguments to function calls. We do this by ensuring that either all + // arguments (including any lambdas) go on the same line as the function + // call, or we break before the first argument. + Style.isCpp() && [&] { // For example, `/*Newline=*/false`. - if (Prev->is(TT_BlockComment) && Current.SpacesRequiredBefore == 0) + if (Previous.is(TT_BlockComment) && Current.SpacesRequiredBefore == 0) return false; const auto *PrevNonComment = Current.getPreviousNonComment(); if (!PrevNonComment || PrevNonComment->isNot(tok::l_paren)) diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index ddb8ef1314aeb..ada4514f314dd 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -11535,7 +11535,7 @@ StmtResult SemaOpenMP::ActOnOpenMPFlushDirective(ArrayRef Clauses, if (C->getClauseKind() == OMPC_acq_rel || C->getClauseKind() == OMPC_acquire || C->getClauseKind() == OMPC_release || - C->getClauseKind() == OMPC_seq_cst) { + C->getClauseKind() == OMPC_seq_cst /*OpenMP 5.1*/) { if (MemOrderKind != OMPC_unknown) { Diag(C->getBeginLoc(), diag::err_omp_several_mem_order_clauses) << getOpenMPDirectiveName(OMPD_flush) << 1 diff --git a/clang/lib/Sema/SemaTemplateVariadic.cpp b/clang/lib/Sema/SemaTemplateVariadic.cpp index 2ea2a368dd24c..86d15f6324f4f 100644 --- a/clang/lib/Sema/SemaTemplateVariadic.cpp +++ b/clang/lib/Sema/SemaTemplateVariadic.cpp @@ -1157,10 +1157,12 @@ ExprResult Sema::ActOnPackIndexingExpr(Scope *S, Expr *PackExpression, return Res; } -ExprResult -Sema::BuildPackIndexingExpr(Expr *PackExpression, SourceLocation EllipsisLoc, - Expr *IndexExpr, SourceLocation RSquareLoc, - ArrayRef ExpandedExprs, bool EmptyPack) { +ExprResult Sema::BuildPackIndexingExpr(Expr *PackExpression, + SourceLocation EllipsisLoc, + Expr *IndexExpr, + SourceLocation RSquareLoc, + ArrayRef ExpandedExprs, + bool FullySubstituted) { std::optional Index; if (!IndexExpr->isInstantiationDependent()) { @@ -1174,8 +1176,8 @@ Sema::BuildPackIndexingExpr(Expr *PackExpression, SourceLocation EllipsisLoc, IndexExpr = Res.get(); } - if (Index && (!ExpandedExprs.empty() || EmptyPack)) { - if (*Index < 0 || EmptyPack || *Index >= int64_t(ExpandedExprs.size())) { + if (Index && FullySubstituted) { + if (*Index < 0 || *Index >= int64_t(ExpandedExprs.size())) { Diag(PackExpression->getBeginLoc(), diag::err_pack_index_out_of_bound) << *Index << PackExpression << ExpandedExprs.size(); return ExprError(); @@ -1184,7 +1186,7 @@ Sema::BuildPackIndexingExpr(Expr *PackExpression, SourceLocation EllipsisLoc, return PackIndexingExpr::Create(getASTContext(), EllipsisLoc, RSquareLoc, PackExpression, IndexExpr, Index, - ExpandedExprs, EmptyPack); + ExpandedExprs, FullySubstituted); } TemplateArgumentLoc Sema::getTemplateArgumentPackExpansionPattern( diff --git a/clang/lib/Sema/TreeTransform.h 
b/clang/lib/Sema/TreeTransform.h index 1465bba87724b..9cf1b2d073a90 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -3670,10 +3670,10 @@ class TreeTransform { SourceLocation RSquareLoc, Expr *PackIdExpression, Expr *IndexExpr, ArrayRef ExpandedExprs, - bool EmptyPack = false) { + bool FullySubstituted = false) { return getSema().BuildPackIndexingExpr(PackIdExpression, EllipsisLoc, IndexExpr, RSquareLoc, ExpandedExprs, - EmptyPack); + FullySubstituted); } /// Build a new expression representing a call to a source location @@ -6769,6 +6769,7 @@ TreeTransform::TransformPackIndexingType(TypeLocBuilder &TLB, if (Out.isNull()) return QualType(); SubtitutedTypes.push_back(Out); + FullySubstituted &= !Out->containsUnexpandedParameterPack(); } // If we're supposed to retain a pack expansion, do so by temporarily // forgetting the partially-substituted parameter pack. @@ -15581,6 +15582,7 @@ TreeTransform::TransformPackIndexingExpr(PackIndexingExpr *E) { } SmallVector ExpandedExprs; + bool FullySubstituted = true; if (!E->expandsToEmptyPack() && E->getExpressions().empty()) { Expr *Pattern = E->getPackIdExpression(); SmallVector Unexpanded; @@ -15605,7 +15607,7 @@ TreeTransform::TransformPackIndexingExpr(PackIndexingExpr *E) { return ExprError(); return getDerived().RebuildPackIndexingExpr( E->getEllipsisLoc(), E->getRSquareLoc(), Pack.get(), IndexExpr.get(), - {}); + {}, /*FullySubstituted=*/false); } for (unsigned I = 0; I != *NumExpansions; ++I) { Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(getSema(), I); @@ -15617,6 +15619,7 @@ TreeTransform::TransformPackIndexingExpr(PackIndexingExpr *E) { OrigNumExpansions); if (Out.isInvalid()) return true; + FullySubstituted = false; } ExpandedExprs.push_back(Out.get()); } @@ -15633,6 +15636,7 @@ TreeTransform::TransformPackIndexingExpr(PackIndexingExpr *E) { OrigNumExpansions); if (Out.isInvalid()) return true; + FullySubstituted = false; ExpandedExprs.push_back(Out.get()); } } else if (!E->expandsToEmptyPack()) { @@ -15644,8 +15648,7 @@ TreeTransform::TransformPackIndexingExpr(PackIndexingExpr *E) { return getDerived().RebuildPackIndexingExpr( E->getEllipsisLoc(), E->getRSquareLoc(), E->getPackIdExpression(), - IndexExpr.get(), ExpandedExprs, - /*EmptyPack=*/ExpandedExprs.size() == 0); + IndexExpr.get(), ExpandedExprs, FullySubstituted); } template diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp index c39a1950a6cf2..731ad0b64dc85 100644 --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -2191,7 +2191,7 @@ void ASTStmtReader::VisitSizeOfPackExpr(SizeOfPackExpr *E) { void ASTStmtReader::VisitPackIndexingExpr(PackIndexingExpr *E) { VisitExpr(E); E->TransformedExpressions = Record.readInt(); - E->ExpandedToEmptyPack = Record.readInt(); + E->FullySubstituted = Record.readInt(); E->EllipsisLoc = readSourceLocation(); E->RSquareLoc = readSourceLocation(); E->SubExprs[0] = Record.readStmt(); diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index e7f567ff59a8a..4994047d9fe10 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -2191,7 +2191,7 @@ void ASTStmtWriter::VisitSizeOfPackExpr(SizeOfPackExpr *E) { void ASTStmtWriter::VisitPackIndexingExpr(PackIndexingExpr *E) { VisitExpr(E); Record.push_back(E->TransformedExpressions); - Record.push_back(E->ExpandedToEmptyPack); + Record.push_back(E->FullySubstituted); 
Record.AddSourceLocation(E->getEllipsisLoc()); Record.AddSourceLocation(E->getRSquareLoc()); Record.AddStmt(E->getPackIdExpression()); diff --git a/clang/test/AST/ByteCode/placement-new.cpp b/clang/test/AST/ByteCode/placement-new.cpp index 56f54ff168f3e..7a4fc89a27dac 100644 --- a/clang/test/AST/ByteCode/placement-new.cpp +++ b/clang/test/AST/ByteCode/placement-new.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -std=c++2c -fcxx-exceptions -fexperimental-new-constant-interpreter -verify=expected,both %s +// RUN: %clang_cc1 -std=c++2c -fcxx-exceptions -fexperimental-new-constant-interpreter -verify=expected,both %s -DBYTECODE // RUN: %clang_cc1 -std=c++2c -fcxx-exceptions -verify=ref,both %s namespace std { @@ -338,3 +338,17 @@ namespace PR48606 { } static_assert(f()); } + +#ifdef BYTECODE +constexpr int N = [] // expected-error {{must be initialized by a constant expression}} \ + // expected-note {{assignment to dereferenced one-past-the-end pointer is not allowed in a constant expression}} \ + // expected-note {{in call to}} +{ + struct S { + int a[1]; + }; + S s; + ::new (s.a) int[1][2][3][4](); + return s.a[0]; +}(); +#endif diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index 3bc6107b7fd40..c22a43146a8c8 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -1,6 +1,6 @@ // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu tahiti -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefixes=CHECK-AMDGCN,CHECK %s -// RUN: %clang_cc1 -cl-std=CL2.0 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefix=CHECK %s +// RUN: %clang_cc1 -cl-std=CL2.0 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefixes=CHECK,CHECK-SPIRV %s #pragma OPENCL EXTENSION cl_khr_fp64 : enable @@ -866,7 +866,8 @@ void test_atomic_inc_dec(__attribute__((address_space(3))) uint *lptr, __attribu // CHECK-LABEL test_wavefrontsize( unsigned test_wavefrontsize() { - // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wavefrontsize() + // CHECK-AMDGCN: ret i32 {{[0-9]+}} + // CHECK-SPIRV: {{.*}}call{{.*}} i32 @llvm.amdgcn.wavefrontsize() return __builtin_amdgcn_wavefrontsize(); } diff --git a/clang/test/OpenMP/flush_ast_print.cpp b/clang/test/OpenMP/flush_ast_print.cpp index 9578ada020227..768282422032f 100644 --- a/clang/test/OpenMP/flush_ast_print.cpp +++ b/clang/test/OpenMP/flush_ast_print.cpp @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=51 -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -std=c++11 
-include-pch %t -verify %s -ast-print | FileCheck %s // expected-no-diagnostics #ifndef HEADER @@ -19,6 +19,7 @@ T tmain(T argc) { #pragma omp flush acq_rel #pragma omp flush acquire #pragma omp flush release +#pragma omp flush seq_cst #pragma omp flush(a) return a + argc; } @@ -27,18 +28,21 @@ T tmain(T argc) { // CHECK-NEXT: #pragma omp flush acq_rel{{$}} // CHECK-NEXT: #pragma omp flush acquire{{$}} // CHECK-NEXT: #pragma omp flush release{{$}} +// CHECK-NEXT: #pragma omp flush seq_cst{{$}} // CHECK-NEXT: #pragma omp flush (a) // CHECK: static int a; // CHECK-NEXT: #pragma omp flush // CHECK-NEXT: #pragma omp flush acq_rel{{$}} // CHECK-NEXT: #pragma omp flush acquire{{$}} // CHECK-NEXT: #pragma omp flush release{{$}} +// CHECK-NEXT: #pragma omp flush seq_cst{{$}} // CHECK-NEXT: #pragma omp flush (a) // CHECK: static char a; // CHECK-NEXT: #pragma omp flush // CHECK-NEXT: #pragma omp flush acq_rel{{$}} // CHECK-NEXT: #pragma omp flush acquire{{$}} // CHECK-NEXT: #pragma omp flush release{{$}} +// CHECK-NEXT: #pragma omp flush seq_cst{{$}} // CHECK-NEXT: #pragma omp flush (a) int main(int argc, char **argv) { @@ -48,11 +52,13 @@ int main(int argc, char **argv) { #pragma omp flush acq_rel #pragma omp flush acquire #pragma omp flush release +#pragma omp flush seq_cst #pragma omp flush(a) // CHECK-NEXT: #pragma omp flush // CHECK-NEXT: #pragma omp flush acq_rel // CHECK-NEXT: #pragma omp flush acquire{{$}} // CHECK-NEXT: #pragma omp flush release +// CHECK-NEXT: #pragma omp flush seq_cst // CHECK-NEXT: #pragma omp flush (a) return tmain(argc) + tmain(argv[0][0]) + a; } diff --git a/clang/test/OpenMP/flush_codegen.cpp b/clang/test/OpenMP/flush_codegen.cpp index c7dd88ef9ac31..fa2586d9fe258 100644 --- a/clang/test/OpenMP/flush_codegen.cpp +++ b/clang/test/OpenMP/flush_codegen.cpp @@ -1,13 +1,13 @@ -// RUN: %clang_cc1 -verify -fopenmp -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s -// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-enable-irbuilder -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-enable-irbuilder -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-enable-irbuilder -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -fopenmp-enable-irbuilder -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -fopenmp-enable-irbuilder -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions 
-emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -fopenmp-enable-irbuilder -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=51 -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} // expected-no-diagnostics #ifndef HEADER @@ -17,6 +17,7 @@ template T tmain(T argc) { static T a; #pragma omp flush +#pragma omp flush seq_cst #pragma omp flush acq_rel #pragma omp flush acquire #pragma omp flush release @@ -28,6 +29,7 @@ T tmain(T argc) { int main() { static int a; #pragma omp flush +#pragma omp flush seq_cst #pragma omp flush acq_rel #pragma omp flush acquire #pragma omp flush release @@ -37,6 +39,7 @@ int main() { // CHECK: call {{.*}}void @__kmpc_flush(ptr {{(@|%).+}}) // CHECK: call {{.*}}void @__kmpc_flush(ptr {{(@|%).+}}) // CHECK: call {{.*}}void @__kmpc_flush(ptr {{(@|%).+}}) + // CHECK: call {{.*}}void @__kmpc_flush(ptr {{(@|%).+}}) return tmain(a); // CHECK: call {{.*}} [[TMAIN:@.+]]( // CHECK: ret @@ -48,6 +51,7 @@ int main() { // CHECK: call {{.*}}void @__kmpc_flush(ptr {{(@|%).+}}) // CHECK: call {{.*}}void @__kmpc_flush(ptr {{(@|%).+}}) // CHECK: call {{.*}}void @__kmpc_flush(ptr {{(@|%).+}}) +// CHECK: call {{.*}}void @__kmpc_flush(ptr {{(@|%).+}}) // CHECK: ret // CHECK-NOT: line: 0, diff --git a/clang/test/OpenMP/flush_messages.cpp b/clang/test/OpenMP/flush_messages.cpp index 3e9142625b274..e78949bc924e1 100644 --- a/clang/test/OpenMP/flush_messages.cpp +++ b/clang/test/OpenMP/flush_messages.cpp @@ -134,14 +134,12 @@ label1 : { #pragma omp flush(argc) flush(argc) // expected-warning {{extra tokens at the end of '#pragma omp flush' are ignored}} #pragma omp parallel flush(argc) // expected-warning {{extra tokens at the end of '#pragma omp parallel' are ignored}} ; -#pragma omp flush seq_cst // omp45-error {{unexpected OpenMP clause 'seq_cst' in directive '#pragma omp flush'}} #pragma omp flush acq_rel // omp45-error {{unexpected OpenMP clause 'acq_rel' in directive '#pragma omp flush'}} #pragma omp flush acquire // omp45-error {{unexpected OpenMP clause 'acquire' in directive '#pragma omp flush'}} #pragma omp flush release // omp45-error {{unexpected OpenMP clause 'release' in directive '#pragma omp flush'}} #pragma omp flush relaxed // expected-error {{unexpected OpenMP clause 'relaxed' in directive '#pragma omp flush'}} -#pragma omp flush seq_cst // omp45-error {{unexpected OpenMP clause 'seq_cst' in directive '#pragma omp flush'}} -#pragma 
omp flush acq_rel acquire // omp45-error {{unexpected OpenMP clause 'acq_rel' in directive '#pragma omp flush'}} omp45-error {{unexpected OpenMP clause 'acquire' in directive '#pragma omp flush'}} omp51-error {{directive '#pragma omp flush' cannot contain more than one 'acq_rel', 'acquire' or 'release' clause}} omp51-note {{'acq_rel' clause used here}} -#pragma omp flush release acquire // omp45-error {{unexpected OpenMP clause 'release' in directive '#pragma omp flush'}} omp45-error {{unexpected OpenMP clause 'acquire' in directive '#pragma omp flush'}} omp51-error {{directive '#pragma omp flush' cannot contain more than one 'acq_rel', 'acquire' or 'release' clause}} omp51-note {{'release' clause used here}} +#pragma omp flush acq_rel acquire // omp45-error {{unexpected OpenMP clause 'acq_rel' in directive '#pragma omp flush'}} omp45-error {{unexpected OpenMP clause 'acquire' in directive '#pragma omp flush'}} omp51-error {{directive '#pragma omp flush' cannot contain more than one 'seq_cst', 'acq_rel', 'acquire' or 'release' clause}} omp51-note {{'acq_rel' clause used here}} +#pragma omp flush release acquire // omp45-error {{unexpected OpenMP clause 'release' in directive '#pragma omp flush'}} omp45-error {{unexpected OpenMP clause 'acquire' in directive '#pragma omp flush'}} omp51-error {{directive '#pragma omp flush' cannot contain more than one 'seq_cst', 'acq_rel', 'acquire' or 'release' clause}} omp51-note {{'release' clause used here}} #pragma omp flush acq_rel (argc) // omp45-error {{unexpected OpenMP clause 'acq_rel' in directive '#pragma omp flush'}} expected-warning {{extra tokens at the end of '#pragma omp flush' are ignored}} #pragma omp flush(argc) acq_rel // omp45-error {{unexpected OpenMP clause 'acq_rel' in directive '#pragma omp flush'}} omp51-error {{'flush' directive with memory order clause 'acq_rel' cannot have the list}} omp51-note {{memory order clause 'acq_rel' is specified here}} return tmain(argc); diff --git a/clang/test/Sema/attr-target-version-unsupported.c b/clang/test/Sema/attr-target-version-unsupported.c new file mode 100644 index 0000000000000..7cf8172f5272e --- /dev/null +++ b/clang/test/Sema/attr-target-version-unsupported.c @@ -0,0 +1,4 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify %s + +//expected-warning@+1 {{unknown attribute 'target_version' ignored}} +int __attribute__((target_version("aes"))) foo(void) { return 3; } diff --git a/clang/test/SemaCXX/cxx2c-pack-indexing.cpp b/clang/test/SemaCXX/cxx2c-pack-indexing.cpp index 962dbb8137f28..cb679a6c3ad87 100644 --- a/clang/test/SemaCXX/cxx2c-pack-indexing.cpp +++ b/clang/test/SemaCXX/cxx2c-pack-indexing.cpp @@ -271,3 +271,37 @@ void f() { } } // namespace GH105903 + +namespace GH116105 { + +template using pack_type = Ts...[Np]; + +template using pack_expr = decltype(Ts...[Np]); + +template struct types; + +template struct indices; + +template struct repack; + +template struct repack> { + template + using pack_type_alias = types...>; + + template + using pack_expr_alias = types...>; +}; + +template struct mdispatch_ { + using Idx = __make_integer_seq; + + static_assert(__is_same( + typename repack::template pack_type_alias, types)); + + static_assert(__is_same( + typename repack::template pack_expr_alias, types)); +}; + +mdispatch_ d; + +} // namespace GH116105 diff --git a/clang/tools/clang-format/ClangFormat.cpp b/clang/tools/clang-format/ClangFormat.cpp index cc735e4872592..5481bb6b87503 100644 --- a/clang/tools/clang-format/ClangFormat.cpp +++ 
b/clang/tools/clang-format/ClangFormat.cpp @@ -178,7 +178,7 @@ enum class WNoError { Unknown }; static cl::bits WNoErrorList( "Wno-error", - cl::desc("If set don't error out on the specified warning type."), + cl::desc("If set, don't error out on the specified warning type."), cl::values( clEnumValN(WNoError::Unknown, "unknown", "If set, unknown format options are only warned about.\n" diff --git a/compiler-rt/lib/interception/interception_win.cpp b/compiler-rt/lib/interception/interception_win.cpp index ac81beee11a39..8b8ce1abe906f 100644 --- a/compiler-rt/lib/interception/interception_win.cpp +++ b/compiler-rt/lib/interception/interception_win.cpp @@ -816,6 +816,10 @@ static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) { // mov rax, QWORD PTR [rip + XXXXXXXX] case 0x058d48: // 48 8d 05 XX XX XX XX : // lea rax, QWORD PTR [rip + XXXXXXXX] + case 0x0d8948: // 48 89 0d XX XX XX XX : + // mov QWORD PTR [rip + XXXXXXXX], rcx + case 0x158948: // 48 89 15 XX XX XX XX : + // mov QWORD PTR [rip + XXXXXXXX], rdx case 0x25ff48: // 48 ff 25 XX XX XX XX : // rex.W jmp QWORD PTR [rip + XXXXXXXX] case 0x158D4C: // 4c 8d 15 XX XX XX XX : lea r10, [rip + XX] diff --git a/flang/test/Lower/OpenMP/Todo/flush-seq-cst.f90 b/flang/test/Lower/OpenMP/Todo/flush-seq-cst.f90 new file mode 100644 index 0000000000000..753e1cfcd7aa5 --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/flush-seq-cst.f90 @@ -0,0 +1,6 @@ +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s -fopenmp-version=51 2>&1 | FileCheck %s + +! CHECK: not yet implemented: Unhandled clause SEQ_CST in FLUSH construct +program flush_seq_cst + !$omp flush seq_cst +end program \ No newline at end of file diff --git a/flang/test/Semantics/OpenMP/clause-validity01.f90 b/flang/test/Semantics/OpenMP/clause-validity01.f90 index 406d30b38948e..8dd6d10200cd3 100644 --- a/flang/test/Semantics/OpenMP/clause-validity01.f90 +++ b/flang/test/Semantics/OpenMP/clause-validity01.f90 @@ -1,6 +1,6 @@ ! REQUIRES: openmp_runtime -! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags %openmp_module_flag -fopenmp-version=50 +! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags %openmp_module_flag -fopenmp-version=51 use omp_lib ! Check OpenMP clause validity for the following directives: ! @@ -507,7 +507,6 @@ !$omp flush acquire !ERROR: If memory-order-clause is RELEASE, ACQUIRE, or ACQ_REL, list items must not be specified on the FLUSH directive !$omp flush release (c) - !ERROR: SEQ_CST clause is not allowed on the FLUSH directive !$omp flush seq_cst !ERROR: RELAXED clause is not allowed on the FLUSH directive !$omp flush relaxed diff --git a/flang/test/Semantics/OpenMP/flush02.f90 b/flang/test/Semantics/OpenMP/flush02.f90 index ed0cf6602d574..615332c6cf31c 100644 --- a/flang/test/Semantics/OpenMP/flush02.f90 +++ b/flang/test/Semantics/OpenMP/flush02.f90 @@ -1,6 +1,6 @@ ! REQUIRES: openmp_runtime -! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags -fopenmp-version=50 +! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags -fopenmp-version=51 ! Check OpenMP 5.0 - 2.17.8 flush Construct ! Restriction - @@ -27,7 +27,6 @@ !Only memory-order-clauses. if (omp_get_thread_num() == 1) THEN ! Not allowed clauses. - !ERROR: SEQ_CST clause is not allowed on the FLUSH directive !$omp flush seq_cst !ERROR: RELAXED clause is not allowed on the FLUSH directive !$omp flush relaxed @@ -41,7 +40,6 @@ !$omp flush acquire acquire ! Mix of allowed and not allowed. 
- !ERROR: SEQ_CST clause is not allowed on the FLUSH directive !$omp flush seq_cst acquire END IF diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index 1f69e4c9acbbd..75121285b7b23 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -1094,7 +1094,7 @@ void InputSection::relocateNonAlloc(Ctx &ctx, uint8_t *buf, // R_ABS/R_DTPREL and some other relocations can be used from non-SHF_ALLOC // sections. if (LLVM_LIKELY(expr == R_ABS) || expr == R_DTPREL || expr == R_GOTPLTREL || - expr == R_RISCV_ADD) { + expr == R_RISCV_ADD || expr == R_ARM_SBREL) { target.relocateNoSym(bufLoc, type, SignExtend64(sym.getVA(ctx, addend))); continue; diff --git a/lld/test/ELF/arm-rwpi-debug-relocs.s b/lld/test/ELF/arm-rwpi-debug-relocs.s new file mode 100644 index 0000000000000..2bb968d4afa9a --- /dev/null +++ b/lld/test/ELF/arm-rwpi-debug-relocs.s @@ -0,0 +1,54 @@ +/// Test that R_ARM_SBREL32 relocations in debug info are relocated as if the +/// static base register (r9) is zero. Real DWARF info will use an expression to +/// add this to the real value of the static base at runtime. + +// REQUIRES: arm +// RUN: rm -rf %t && split-file %s %t && cd %t + +// RUN: llvm-mc -filetype=obj -triple=armv7a asm.s -o obj.o +// RUN: ld.lld -T lds.ld obj.o -o exe.elf 2>&1 | FileCheck %s --implicit-check-not=warning: --allow-empty +// RUN: llvm-objdump -D exe.elf | FileCheck --check-prefix=DISASM %s + +// DISASM-LABEL: : +// DISASM-NEXT: 1000: 0000002a + +// DISASM-LABEL: : +// DISASM-NEXT: 2000: 000004d2 + +// DISASM-LABEL: <.debug_something>: +// DISASM-NEXT: 0: 00001000 +// DISASM-NEXT: ... +// DISASM-NEXT: 104: 00002000 + +//--- lds.ld +SECTIONS { + data1 0x1000 : { *(data1) } + data2 0x2000 : { *(data2) } +} + +//--- asm.s + .text + .type _start,%function + .globl _start +_start: + bx lr + .size _start, .-_start + + .section data1, "aw", %progbits + .type rw,%object + .globl rw +rw: + .long 42 + .size rw, 4 + + .section data2, "aw", %progbits + .type rw2,%object + .globl rw2 +rw2: + .long 1234 + .size rw2, 4 + + .section .debug_something, "", %progbits + .long rw(sbrel) + .space 0x100 + .long rw2(sbrel) diff --git a/lldb/docs/use/aarch64-linux.md b/lldb/docs/use/aarch64-linux.md index 70432f57857a5..393838dc0bb4f 100644 --- a/lldb/docs/use/aarch64-linux.md +++ b/lldb/docs/use/aarch64-linux.md @@ -17,7 +17,7 @@ In LLDB you will be able to see the following new registers: * `z0-z31` vector registers, each one has size equal to the vector length. * `p0-p15` predicate registers, each one containing 1 bit per byte in the vector - length. Making each one vector length / 8 sized. + length. So each one is `vector length in bits / 8` bits. * `ffr` the first fault register, same size as a predicate register. * `vg`, the vector length in "granules". Each granule is 8 bytes. diff --git a/lldb/packages/Python/lldbsuite/test/lldbutil.py b/lldb/packages/Python/lldbsuite/test/lldbutil.py index 660a3c085a908..07b5f8cc7d900 100644 --- a/lldb/packages/Python/lldbsuite/test/lldbutil.py +++ b/lldb/packages/Python/lldbsuite/test/lldbutil.py @@ -1158,17 +1158,6 @@ def GetModuleName(i): return list(map(GetModuleName, list(range(thread.GetNumFrames())))) -def get_stack_frames(thread): - """ - Returns a sequence of stack frames for this thread. 
- """ - - def GetStackFrame(i): - return thread.GetFrameAtIndex(i) - - return list(map(GetStackFrame, list(range(thread.GetNumFrames())))) - - def print_stacktrace(thread, string_buffer=False): """Prints a simple stack trace of this thread.""" diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFIndex.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFIndex.cpp index c18edd10b9681..30c890d6d0138 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFIndex.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFIndex.cpp @@ -137,9 +137,19 @@ void DWARFIndex::GetTypesWithQuery( bool DWARFIndex::ProcessTypeDIEMatchQuery( TypeQuery &query, DWARFDIE die, llvm::function_ref callback) { - // Nothing to match from query - if (query.GetContextRef().size() <= 1) + // Check the language, but only if we have a language filter. + if (query.HasLanguage() && + !query.LanguageMatches(SymbolFileDWARF::GetLanguageFamily(*die.GetCU()))) + return true; // Keep iterating over index types, language mismatch. + + // Since mangled names are unique, we only need to check if the names are + // the same. + if (query.GetSearchByMangledName()) { + if (die.GetMangledName(/*substitute_name_allowed=*/false) != + query.GetTypeBasename().GetStringRef()) + return true; // Keep iterating over index types, mangled name mismatch. return callback(die); + } std::vector die_context; if (query.GetModuleSearch()) diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index 8ce0db4588a46..c900f330b481b 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -2726,39 +2726,8 @@ void SymbolFileDWARF::FindTypes(const TypeQuery &query, TypeResults &results) { TypeQuery query_full(query); bool have_index_match = false; m_index->GetTypesWithQuery(query_full, [&](DWARFDIE die) { - // Check the language, but only if we have a language filter. - if (query.HasLanguage()) { - if (!query.LanguageMatches(GetLanguageFamily(*die.GetCU()))) - return true; // Keep iterating over index types, language mismatch. - } - - // Since mangled names are unique, we only need to check if the names are - // the same. - if (query.GetSearchByMangledName()) { - if (die.GetMangledName(/*substitute_name_allowed=*/false) != - query.GetTypeBasename().GetStringRef()) - return true; // Keep iterating over index types, mangled name mismatch. - if (Type *matching_type = ResolveType(die, true, true)) { - results.InsertUnique(matching_type->shared_from_this()); - return !results.Done(query); // Keep iterating if we aren't done. - } - return true; // Keep iterating over index types, weren't able to resolve - // this type - } - - // Check the context matches - std::vector die_context; - if (query.GetModuleSearch()) - die_context = die.GetDeclContext(); - else - die_context = die.GetTypeLookupContext(); - assert(!die_context.empty()); - if (!query.ContextMatches(die_context)) - return true; // Keep iterating over index types, context mismatch. - - // Try to resolve the type. if (Type *matching_type = ResolveType(die, true, true)) { - if (matching_type->IsTemplateType()) { + if (!query.GetSearchByMangledName() && matching_type->IsTemplateType()) { // We have to watch out for case where we lookup a type by basename and // it matches a template with simple template names. 
Like looking up // "Foo" and if we have simple template names then we will match @@ -2790,7 +2759,7 @@ void SymbolFileDWARF::FindTypes(const TypeQuery &query, TypeResults &results) { // With -gsimple-template-names, a templated type's DW_AT_name will not // contain the template parameters. Try again stripping '<' and anything // after, filtering out entries with template parameters that don't match. - if (!have_index_match) { + if (!have_index_match && !query.GetSearchByMangledName()) { // Create a type matcher with a compiler context that is tuned for // -gsimple-template-names. We will use this for the index lookup and the // context matching, but will use the original "match" to insert matches @@ -2804,23 +2773,6 @@ void SymbolFileDWARF::FindTypes(const TypeQuery &query, TypeResults &results) { // Copy our match's context and update the basename we are looking for // so we can use this only to compare the context correctly. m_index->GetTypesWithQuery(query_simple, [&](DWARFDIE die) { - // Check the language, but only if we have a language filter. - if (query.HasLanguage()) { - if (!query.LanguageMatches(GetLanguageFamily(*die.GetCU()))) - return true; // Keep iterating over index types, language mismatch. - } - - // Check the context matches - std::vector die_context; - if (query.GetModuleSearch()) - die_context = die.GetDeclContext(); - else - die_context = die.GetTypeLookupContext(); - assert(!die_context.empty()); - if (!query_simple.ContextMatches(die_context)) - return true; // Keep iterating over index types, context mismatch. - - // Try to resolve the type. if (Type *matching_type = ResolveType(die, true, true)) { ConstString name = matching_type->GetQualifiedName(); // We have found a type that still might not match due to template diff --git a/lldb/test/API/symbol_ondemand/shared_library/TestSharedLibOnDemand.py b/lldb/test/API/symbol_ondemand/shared_library/TestSharedLibOnDemand.py index f1c23a58d1f48..dbb9576ed4d51 100644 --- a/lldb/test/API/symbol_ondemand/shared_library/TestSharedLibOnDemand.py +++ b/lldb/test/API/symbol_ondemand/shared_library/TestSharedLibOnDemand.py @@ -59,7 +59,7 @@ def test_source_line_breakpoint(self): lldbutil.check_breakpoint(self, bpno=1, expected_hit_count=1) thread = process.GetSelectedThread() - stack_frames = lldbutil.get_stack_frames(thread) + stack_frames = thread.frames self.assertGreater(len(stack_frames), 2) leaf_frame = stack_frames[0] @@ -97,7 +97,7 @@ def test_symbolic_breakpoint(self): lldbutil.check_breakpoint(self, bpno=1, expected_hit_count=1) thread = process.GetSelectedThread() - stack_frames = lldbutil.get_stack_frames(thread) + stack_frames = thread.frames self.assertGreater(len(stack_frames), 2) leaf_frame = stack_frames[0] diff --git a/llvm/Maintainers.md b/llvm/Maintainers.md index 71dd4c6b75b63..c3f98c9bbdff5 100644 --- a/llvm/Maintainers.md +++ b/llvm/Maintainers.md @@ -121,8 +121,8 @@ david.trevelyan@gmail.com (email), [davidtrevelyan](https://github.com/davidtrev #### Parts of code generator not covered by someone else -Evan Cheng \ -evan.cheng@apple.com (email) +Matt Arsenault \ +Matthew.Arsenault@amd.com, arsenm2@gmail.com (email), [arsenm](https://github.com/arsenm) (GitHub) #### SelectionDAG @@ -469,6 +469,7 @@ sabre@nondot.org (email), [lattner](https://github.com/lattner) (GitHub), clattn ### Inactive or former component maintainers Justin Bogner (mail@justinbogner.com, [bogner](https://github.com/bogner)) -- SelectionDAG \ +Evan Cheng (evan.cheng@apple.com) -- Parts of code generator not covered by someone else \ 
Hans Wennborg (hans@chromium.org, [zmodem](https://github.com/zmodem)) -- Release management \ ### Former maintainers of removed components diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 3670228a10835..fabe2dfd33178 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -21523,9 +21523,9 @@ This is an overloaded intrinsic. :: - declare <16 x i32> @llvm.vp.abs.v16i32 (<16 x i32> , <16 x i1> , i32 , i1 ) - declare @llvm.vp.abs.nxv4i32 ( , , i32 , i1 ) - declare <256 x i64> @llvm.vp.abs.v256i64 (<256 x i64> , <256 x i1> , i32 , i1 ) + declare <16 x i32> @llvm.vp.abs.v16i32 (<16 x i32> , i1 , <16 x i1> , i32 ) + declare @llvm.vp.abs.nxv4i32 ( , i1 , , i32 ) + declare <256 x i64> @llvm.vp.abs.v256i64 (<256 x i64> , i1 , <256 x i1> , i32 ) Overview: """"""""" @@ -21537,12 +21537,12 @@ Arguments: """""""""" The first argument and the result have the same vector of integer type. The -second argument is the vector mask and has the same number of elements as the -result vector type. The third argument is the explicit vector length of the -operation. The fourth argument must be a constant and is a flag to indicate -whether the result value of the '``llvm.vp.abs``' intrinsic is a -:ref:`poison value ` if the first argument is statically or -dynamically an ``INT_MIN`` value. +second argument must be a constant and is a flag to indicate whether the result +value of the '``llvm.vp.abs``' intrinsic is a :ref:`poison value ` +if the first argument is statically or dynamically an ``INT_MIN`` value. The +third argument is the vector mask and has the same number of elements as the +result vector type. The fourth argument is the explicit vector length of the +operation. Semantics: """""""""" @@ -21555,7 +21555,7 @@ Examples: .. code-block:: llvm - %r = call <4 x i32> @llvm.vp.abs.v4i32(<4 x i32> %a, <4 x i1> %mask, i32 %evl, i1 false) + %r = call <4 x i32> @llvm.vp.abs.v4i32(<4 x i32> %a, i1 false, <4 x i1> %mask, i32 %evl) ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r %t = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a, i1 false) @@ -25261,9 +25261,9 @@ This is an overloaded intrinsic. :: - declare <16 x i32> @llvm.vp.ctlz.v16i32 (<16 x i32> , <16 x i1> , i32 , i1 ) - declare @llvm.vp.ctlz.nxv4i32 ( , , i32 , i1 ) - declare <256 x i64> @llvm.vp.ctlz.v256i64 (<256 x i64> , <256 x i1> , i32 , i1 ) + declare <16 x i32> @llvm.vp.ctlz.v16i32 (<16 x i32> , i1 , <16 x i1> , i32 ) + declare @llvm.vp.ctlz.nxv4i32 ( , i1 , , i32 ) + declare <256 x i64> @llvm.vp.ctlz.v256i64 (<256 x i64> , i1 , <256 x i1> , i32 ) Overview: """"""""" @@ -25275,11 +25275,11 @@ Arguments: """""""""" The first argument and the result have the same vector of integer type. The -second argument is the vector mask and has the same number of elements as the -result vector type. The third argument is the explicit vector length of the -operation. The fourth argument is a constant flag that indicates whether the -intrinsic returns a valid result if the first argument is zero. If the first -argument is zero and the fourth argument is true, the result is poison. +second argument is a constant flag that indicates whether the intrinsic returns +a valid result if the first argument is zero. The third argument is the vector +mask and has the same number of elements as the result vector type. the fourth +argument is the explicit vector length of the operation. If the first argument +is zero and the second argument is true, the result is poison. Semantics: """""""""" @@ -25292,7 +25292,7 @@ Examples: .. 
code-block:: llvm - %r = call <4 x i32> @llvm.vp.ctlz.v4i32(<4 x i32> %a, <4 x i1> %mask, i32 %evl, i1 false) + %r = call <4 x i32> @llvm.vp.ctlz.v4i32(<4 x i32> %a, i1 false, <4 x i1> %mask, i32 %evl) ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r %t = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) @@ -25310,9 +25310,9 @@ This is an overloaded intrinsic. :: - declare <16 x i32> @llvm.vp.cttz.v16i32 (<16 x i32> <op>, <16 x i1> <mask>, i32 <vector_length>, i1 <is_zero_poison>) - declare <vscale x 4 x i32> @llvm.vp.cttz.nxv4i32 (<vscale x 4 x i32> <op>, <vscale x 4 x i1> <mask>, i32 <vector_length>, i1 <is_zero_poison>) - declare <256 x i64> @llvm.vp.cttz.v256i64 (<256 x i64> <op>, <256 x i1> <mask>, i32 <vector_length>, i1 <is_zero_poison>) + declare <16 x i32> @llvm.vp.cttz.v16i32 (<16 x i32> <op>, i1 <is_zero_poison>, <16 x i1> <mask>, i32 <vector_length>) + declare <vscale x 4 x i32> @llvm.vp.cttz.nxv4i32 (<vscale x 4 x i32> <op>, i1 <is_zero_poison>, <vscale x 4 x i1> <mask>, i32 <vector_length>) + declare <256 x i64> @llvm.vp.cttz.v256i64 (<256 x i64> <op>, i1 <is_zero_poison>, <256 x i1> <mask>, i32 <vector_length>) Overview: """"""""" @@ -25324,11 +25324,11 @@ Arguments: """""""""" The first argument and the result have the same vector of integer type. The -second argument is the vector mask and has the same number of elements as the -result vector type. The third argument is the explicit vector length of the -operation. The fourth argument is a constant flag that indicates whether the -intrinsic returns a valid result if the first argument is zero. If the first -argument is zero and the fourth argument is true, the result is poison. +second argument is a constant flag that indicates whether the intrinsic +returns a valid result if the first argument is zero. The third argument is +the vector mask and has the same number of elements as the result vector type. +The fourth argument is the explicit vector length of the operation. If the +first argument is zero and the second argument is true, the result is poison. Semantics: """""""""" @@ -25341,7 +25341,7 @@ Examples: .. code-block:: llvm - %r = call <4 x i32> @llvm.vp.cttz.v4i32(<4 x i32> %a, <4 x i1> %mask, i32 %evl, i1 false) + %r = call <4 x i32> @llvm.vp.cttz.v4i32(<4 x i32> %a, i1 false, <4 x i1> %mask, i32 %evl) ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r %t = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 false) diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index d3d68ff1c6ed2..d2fc40d8ae037 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -277,6 +277,20 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast); } + bool areInlineCompatible(const Function *Caller, + const Function *Callee) const { + const TargetMachine &TM = getTLI()->getTargetMachine(); + + const FeatureBitset &CallerBits = + TM.getSubtargetImpl(*Caller)->getFeatureBits(); + const FeatureBitset &CalleeBits = + TM.getSubtargetImpl(*Callee)->getFeatureBits(); + + // Inline a callee if its target-features are a subset of the caller's + // target-features. + return (CallerBits & CalleeBits) == CalleeBits; + } + bool hasBranchDivergence(const Function *F = nullptr) { return false; } bool isSourceOfDivergence(const Value *V) { return false; } diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h index 73a9586aaa644..dd4d90864a08c 100644 --- a/llvm/include/llvm/ProfileData/MemProf.h +++ b/llvm/include/llvm/ProfileData/MemProf.h @@ -1117,21 +1117,20 @@ template class CallStackRadixTreeBuilder { // Encode a call stack into RadixArray. Return the starting index within // RadixArray.
- LinearCallStackId - encodeCallStack(const llvm::SmallVector *CallStack, - const llvm::SmallVector *Prev, - std::optional> - MemProfFrameIndexes); + LinearCallStackId encodeCallStack( + const llvm::SmallVector *CallStack, + const llvm::SmallVector *Prev, + const llvm::DenseMap *MemProfFrameIndexes); public: CallStackRadixTreeBuilder() = default; // Build a radix tree array. - void build(llvm::MapVector> - &&MemProfCallStackData, - std::optional> - MemProfFrameIndexes, - llvm::DenseMap &FrameHistogram); + void + build(llvm::MapVector> + &&MemProfCallStackData, + const llvm::DenseMap *MemProfFrameIndexes, + llvm::DenseMap &FrameHistogram); ArrayRef getRadixArray() const { return RadixArray; } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index ba5d93f5eb55d..3e1071fe71a2a 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -4363,7 +4363,7 @@ static DenseMap writeMemoryProfileRadixTree( CallStackRadixTreeBuilder Builder; // We don't need a MemProfFrameIndexes map as we have already converted the // full stack id hash to a linear offset into the StackIds array. - Builder.build(std::move(CallStacks), /*MemProfFrameIndexes=*/std::nullopt, + Builder.build(std::move(CallStacks), /*MemProfFrameIndexes=*/nullptr, FrameHistogram); Stream.EmitRecord(bitc::FS_CONTEXT_RADIX_TREE_ARRAY, Builder.getRadixArray(), RadixAbbrev); diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index 3910046a1652b..b08a93ae9a6d5 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -3033,7 +3033,11 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { if (!MOP.getReg().isPhysical()) continue; - if (llvm::is_contained(TRI->subregs(MOP.getReg()), Reg)) + if (MOP.getReg() != Reg && + all_of(TRI->regunits(Reg), [&](const MCRegUnit RegUnit) { + return llvm::is_contained(TRI->regunits(MOP.getReg()), + RegUnit); + })) Bad = false; } } diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 521829675ae7c..57cce9f785263 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -22746,16 +22746,21 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT, /// Transform a vector binary operation into a scalar binary operation by moving /// the math/logic after an extract element of a vector. -static SDValue scalarizeExtractedBinop(SDNode *ExtElt, SelectionDAG &DAG, - const SDLoc &DL, bool LegalOperations) { +static SDValue scalarizeExtractedBinOp(SDNode *ExtElt, SelectionDAG &DAG, + const SDLoc &DL) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Vec = ExtElt->getOperand(0); SDValue Index = ExtElt->getOperand(1); auto *IndexC = dyn_cast(Index); - if (!IndexC || !TLI.isBinOp(Vec.getOpcode()) || !Vec.hasOneUse() || + unsigned Opc = Vec.getOpcode(); + if (!IndexC || (!TLI.isBinOp(Opc) && Opc != ISD::SETCC) || !Vec.hasOneUse() || Vec->getNumValues() != 1) return SDValue(); + EVT ResVT = ExtElt->getValueType(0); + if (Opc == ISD::SETCC && ResVT != Vec.getValueType().getVectorElementType()) + return SDValue(); + // Targets may want to avoid this to prevent an expensive register transfer. 
if (!TLI.shouldScalarizeBinop(Vec)) return SDValue(); @@ -22766,19 +22771,23 @@ static SDValue scalarizeExtractedBinop(SDNode *ExtElt, SelectionDAG &DAG, SDValue Op0 = Vec.getOperand(0); SDValue Op1 = Vec.getOperand(1); APInt SplatVal; - if (isAnyConstantBuildVector(Op0, true) || - ISD::isConstantSplatVector(Op0.getNode(), SplatVal) || - isAnyConstantBuildVector(Op1, true) || - ISD::isConstantSplatVector(Op1.getNode(), SplatVal)) { - // extractelt (binop X, C), IndexC --> binop (extractelt X, IndexC), C' - // extractelt (binop C, X), IndexC --> binop C', (extractelt X, IndexC) - EVT VT = ExtElt->getValueType(0); - SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Index); - SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op1, Index); - return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1); - } + if (!isAnyConstantBuildVector(Op0, true) && + !ISD::isConstantSplatVector(Op0.getNode(), SplatVal) && + !isAnyConstantBuildVector(Op1, true) && + !ISD::isConstantSplatVector(Op1.getNode(), SplatVal)) + return SDValue(); - return SDValue(); + // extractelt (op X, C), IndexC --> op (extractelt X, IndexC), C' + // extractelt (op C, X), IndexC --> op C', (extractelt X, IndexC) + EVT OpVT = Op0->getValueType(0).getVectorElementType(); + Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Op0, Index); + Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Op1, Index); + + if (Opc == ISD::SETCC) + return DAG.getSetCC(DL, ResVT, Op0, Op1, + cast(Vec->getOperand(2))->get()); + else + return DAG.getNode(Opc, DL, ResVT, Op0, Op1); } // Given a ISD::EXTRACT_VECTOR_ELT, which is a glorified bit sequence extract, @@ -23011,7 +23020,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { } } - if (SDValue BO = scalarizeExtractedBinop(N, DAG, DL, LegalOperations)) + if (SDValue BO = scalarizeExtractedBinOp(N, DAG, DL)) return BO; if (VecVT.isScalableVector()) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 1480bd98c685e..fbc96bade15f5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -2330,10 +2330,10 @@ SDValue SelectionDAGLegalize::expandLdexp(SDNode *Node) const { const APFloat::ExponentType MinExpVal = APFloat::semanticsMinExponent(FltSem); const int Precision = APFloat::semanticsPrecision(FltSem); - const SDValue MaxExp = DAG.getConstant(MaxExpVal, dl, ExpVT); - const SDValue MinExp = DAG.getConstant(MinExpVal, dl, ExpVT); + const SDValue MaxExp = DAG.getSignedConstant(MaxExpVal, dl, ExpVT); + const SDValue MinExp = DAG.getSignedConstant(MinExpVal, dl, ExpVT); - const SDValue DoubleMaxExp = DAG.getConstant(2 * MaxExpVal, dl, ExpVT); + const SDValue DoubleMaxExp = DAG.getSignedConstant(2 * MaxExpVal, dl, ExpVT); const APFloat One(FltSem, "1.0"); APFloat ScaleUpK = scalbn(One, MaxExpVal, APFloat::rmNearestTiesToEven); @@ -2375,7 +2375,7 @@ SDValue SelectionDAGLegalize::expandLdexp(SDNode *Node) const { SDValue IncN0 = DAG.getNode(ISD::ADD, dl, ExpVT, N, Increment0, NUW_NSW); SDValue ClampMinVal = - DAG.getConstant(3 * MinExpVal + 2 * Precision, dl, ExpVT); + DAG.getSignedConstant(3 * MinExpVal + 2 * Precision, dl, ExpVT); SDValue ClampN_Small = DAG.getNode(ISD::SMAX, dl, ExpVT, N, ClampMinVal); SDValue IncN1 = DAG.getNode(ISD::ADD, dl, ExpVT, ClampN_Small, Increment1, NSW); @@ -2385,8 +2385,8 @@ SDValue SelectionDAGLegalize::expandLdexp(SDNode *Node) const { SDValue ScaleDown1 = DAG.getNode(ISD::FMUL, dl, VT, ScaleDown0, ScaleDownVal); SDValue 
ScaleDownTwice = DAG.getSetCC( - dl, SetCCVT, N, DAG.getConstant(2 * MinExpVal + Precision, dl, ExpVT), - ISD::SETULT); + dl, SetCCVT, N, + DAG.getSignedConstant(2 * MinExpVal + Precision, dl, ExpVT), ISD::SETULT); SDValue SelectN_Small = DAG.getNode(ISD::SELECT, dl, ExpVT, ScaleDownTwice, IncN1, IncN0); diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index 725ff9256fd4a..d8ab18d213e3d 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -351,9 +351,14 @@ bool InstrProfWriter::addMemProfCallStack( bool InstrProfWriter::addMemProfData(memprof::IndexedMemProfData Incoming, function_ref Warn) { - // TODO: Once we remove support for MemProf format Version V1, assert that - // the three components (frames, call stacks, and records) are either all - // empty or populated. + // Return immediately if everything is empty. + if (Incoming.Frames.empty() && Incoming.CallStacks.empty() && + Incoming.Records.empty()) + return true; + + // Otherwise, every component must be non-empty. + assert(!Incoming.Frames.empty() && !Incoming.CallStacks.empty() && + !Incoming.Records.empty()); if (MemProfData.Frames.empty()) MemProfData.Frames = std::move(Incoming.Frames); @@ -636,7 +641,7 @@ writeMemProfCallStackArray( MemProfCallStackIndexes; memprof::CallStackRadixTreeBuilder Builder; - Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes, + Builder.build(std::move(MemProfCallStackData), &MemProfFrameIndexes, FrameHistogram); for (auto I : Builder.getRadixArray()) OS.write32(I); diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp index 20cc4eececc9b..1c240c3858cc7 100644 --- a/llvm/lib/ProfileData/MemProf.cpp +++ b/llvm/lib/ProfileData/MemProf.cpp @@ -335,8 +335,7 @@ template LinearCallStackId CallStackRadixTreeBuilder::encodeCallStack( const llvm::SmallVector *CallStack, const llvm::SmallVector *Prev, - std::optional> - MemProfFrameIndexes) { + const llvm::DenseMap *MemProfFrameIndexes) { // Compute the length of the common root prefix between Prev and CallStack. uint32_t CommonLen = 0; if (Prev) { @@ -381,8 +380,7 @@ template void CallStackRadixTreeBuilder::build( llvm::MapVector> &&MemProfCallStackData, - std::optional> - MemProfFrameIndexes, + const llvm::DenseMap *MemProfFrameIndexes, llvm::DenseMap &FrameHistogram) { // Take the vector portion of MemProfCallStackData. The vector is exactly // what we need to sort. Also, we no longer need its lookup capability. 
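The `CallStackRadixTreeBuilder` change above swaps a by-value `std::optional` map parameter for a plain `const` pointer: callers with no frame-index map now pass `nullptr` (as the bitcode writer does, since its ids are already linear offsets) instead of `std::nullopt`, and callers that do have a map no longer pay for copying a whole `DenseMap` into the optional. A minimal sketch of the same pattern, with `std::unordered_map` standing in for `llvm::DenseMap` and illustrative names throughout:

```cpp
#include <cstdint>
#include <iostream>
#include <unordered_map>

using FrameId = uint64_t;
using LinearFrameId = uint32_t;

// Nullable-map parameter, mirroring the new builder signature: callers whose
// ids are already linear pass nullptr; others pass the address of their map.
LinearFrameId toLinear(FrameId Id,
                       const std::unordered_map<FrameId, LinearFrameId> *Indexes) {
  if (Indexes) {
    auto It = Indexes->find(Id);
    return It != Indexes->end() ? It->second : 0; // 0 as a toy "not found"
  }
  // No map supplied: treat the id as already being a linear offset.
  return static_cast<LinearFrameId>(Id);
}

int main() {
  std::unordered_map<FrameId, LinearFrameId> Indexes = {{0xdeadbeefULL, 7}};
  std::cout << toLinear(0xdeadbeefULL, &Indexes) << '\n'; // 7, via the map
  std::cout << toLinear(42, nullptr) << '\n';             // 42, already linear
}
```

The pointer keeps the "absent" state representable while avoiding the construction cost that `std::optional<Map>` forces at every call boundary.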
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index cb0b9e965277a..d51b36f7e4994 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -1348,6 +1348,10 @@ class AArch64TargetLowering : public TargetLowering { unsigned getMinimumJumpTableEntries() const override; bool softPromoteHalfType() const override { return true; } + + bool shouldScalarizeBinop(SDValue VecOp) const override { + return VecOp.getOpcode() == ISD::SETCC; + } }; namespace AArch64 { diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index ec7bb71fd111f..7a1e401bca18c 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -266,16 +266,7 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, return false; } - const TargetMachine &TM = getTLI()->getTargetMachine(); - - const FeatureBitset &CallerBits = - TM.getSubtargetImpl(*Caller)->getFeatureBits(); - const FeatureBitset &CalleeBits = - TM.getSubtargetImpl(*Callee)->getFeatureBits(); - - // Inline a callee if its target-features are a subset of the callers - // target-features. - return (CallerBits & CalleeBits) == CalleeBits; + return BaseT::areInlineCompatible(Caller, Callee); } bool AArch64TTIImpl::areTypesABICompatible( diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 9c1bbafd337e9..ad31f29c04599 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1303,6 +1303,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .clampNumElements(0, v4s16, v8s16) .clampNumElements(0, v2s32, v4s32) .clampMaxNumElements(0, s64, 2) + .scalarizeIf(scalarOrEltWiderThan(0, 64), 0) .moreElementsToNextPow2(0) .lower(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 1c738e352b07b..7d78e9cd7eab6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -408,7 +408,8 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const { unsigned AS = cast(N)->getAddressSpace(); if (AS == AMDGPUAS::LOCAL_ADDRESS) { if (Subtarget->ldsRequiresM0Init()) - return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32)); + return glueCopyToM0( + N, CurDAG->getSignedTargetConstant(-1, SDLoc(N), MVT::i32)); } else if (AS == AMDGPUAS::REGION_ADDRESS) { MachineFunction &MF = CurDAG->getMachineFunction(); unsigned Value = MF.getInfo()->getGDSSize(); @@ -1724,7 +1725,7 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr, } VAddr = Addr; - Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32); + Offset = CurDAG->getSignedTargetConstant(OffsetVal, SDLoc(), MVT::i32); return true; } @@ -1832,7 +1833,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, } if (SAddr) { - Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32); + Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32); return true; } } @@ -1848,7 +1849,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32, CurDAG->getTargetConstant(0, SDLoc(), MVT::i32)); VOffset = SDValue(VMov, 0); - Offset = 
CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32); + Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32); return true; } @@ -1903,13 +1904,13 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr, SDValue AddOffset = SAddr.getOpcode() == ISD::TargetFrameIndex ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL) - : CurDAG->getTargetConstant(RemainderOffset, DL, MVT::i32); + : CurDAG->getSignedTargetConstant(RemainderOffset, DL, MVT::i32); SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32, SAddr, AddOffset), 0); } - Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i32); + Offset = CurDAG->getSignedTargetConstant(COffsetVal, DL, MVT::i32); return true; } @@ -2058,7 +2059,7 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, std::optional EncodedOffset = AMDGPU::getSMRDEncodedOffset( *Subtarget, ByteOffset, IsBuffer, HasSOffset); if (EncodedOffset && Offset && !Imm32Only) { - *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32); + *Offset = CurDAG->getSignedTargetConstant(*EncodedOffset, SL, MVT::i32); return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 3cc4bd92f6471..d77508227b076 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -2333,7 +2333,7 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, SDValue RHS = Op.getOperand(1); SDValue Zero = DAG.getConstant(0, DL, VT); - SDValue NegOne = DAG.getConstant(-1, DL, VT); + SDValue NegOne = DAG.getAllOnesConstant(DL, VT); if (VT == MVT::i32) { if (SDValue Res = LowerDIVREM24(Op, DAG, true)) @@ -3794,7 +3794,11 @@ static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, if (Width + Offset < 32) { uint32_t Shl = static_cast(Src0) << (32 - Offset - Width); IntTy Result = static_cast(Shl) >> (32 - Width); - return DAG.getConstant(Result, DL, MVT::i32); + if constexpr (std::is_signed_v) { + return DAG.getSignedConstant(Result, DL, MVT::i32); + } else { + return DAG.getConstant(Result, DL, MVT::i32); + } } return DAG.getConstant(Src0 >> Offset, DL, MVT::i32); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 087de1bed86f7..18a09c39a0638 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -1024,6 +1024,12 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { } break; } + case Intrinsic::amdgcn_wavefrontsize: { + if (ST->isWaveSizeKnown()) + return IC.replaceInstUsesWith( + II, ConstantInt::get(II.getType(), ST->getWavefrontSize())); + break; + } case Intrinsic::amdgcn_wqm_vote: { // wqm_vote is identity when the argument is constant. 
if (!isa(II.getArgOperand(0))) diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 1b88fdd3ab2e1..c2e952418f1be 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -919,7 +919,7 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT); HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT); } else if (CompareVT == MVT::i32) { - HWTrue = DAG.getConstant(-1, DL, CompareVT); + HWTrue = DAG.getAllOnesConstant(DL, CompareVT); HWFalse = DAG.getConstant(0, DL, CompareVT); } else { @@ -949,7 +949,7 @@ SDValue R600TargetLowering::lowerADDRSPACECAST(SDValue Op, unsigned DestAS = ASC->getDestAddressSpace(); if (isNullConstant(Op.getOperand(0)) && SrcAS == AMDGPUAS::FLAT_ADDRESS) - return DAG.getConstant(TM.getNullPointerValue(DestAS), SL, VT); + return DAG.getSignedConstant(TM.getNullPointerValue(DestAS), SL, VT); return Op; } @@ -1750,11 +1750,11 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, } return DAG.getNode(ISD::SELECT_CC, DL, N->getValueType(0), - SelectCC.getOperand(0), // LHS - SelectCC.getOperand(1), // RHS - DAG.getConstant(-1, DL, MVT::i32), // True - DAG.getConstant(0, DL, MVT::i32), // False - SelectCC.getOperand(4)); // CC + SelectCC.getOperand(0), // LHS + SelectCC.getOperand(1), // RHS + DAG.getAllOnesConstant(DL, MVT::i32), // True + DAG.getConstant(0, DL, MVT::i32), // False + SelectCC.getOperand(4)); // CC } // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 73ca59fe320d2..f3b5e6985e8e0 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4019,10 +4019,11 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op, Align StackAlign = TFL->getStackAlign(); Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value if (Alignment && *Alignment > StackAlign) { - Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1, - DAG.getConstant(-(uint64_t)Alignment->value() - << Subtarget->getWavefrontSizeLog2(), - dl, VT)); + Tmp1 = DAG.getNode( + ISD::AND, dl, VT, Tmp1, + DAG.getSignedConstant(-(uint64_t)Alignment->value() + << Subtarget->getWavefrontSizeLog2(), + dl, VT)); } Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain @@ -6771,10 +6772,10 @@ SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const { // TODO: This should be a generic narrowing legalization, and can easily be // for GlobalISel. 
- SDValue MinExp = DAG.getConstant(minIntN(16), DL, ExpVT); + SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT); SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp); - SDValue MaxExp = DAG.getConstant(maxIntN(16), DL, ExpVT); + SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT); SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp); SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp); @@ -7542,11 +7543,11 @@ SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op, SDValue Vec0 = SVN->getOperand(VecIdx0); SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0, - DAG.getConstant(EltIdx0, SL, MVT::i32)); + DAG.getSignedConstant(EltIdx0, SL, MVT::i32)); SDValue Vec1 = SVN->getOperand(VecIdx1); SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1, - DAG.getConstant(EltIdx1, SL, MVT::i32)); + DAG.getSignedConstant(EltIdx1, SL, MVT::i32)); Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1})); } } @@ -9618,7 +9619,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait if (ST.hasSplitBarriers()) { SDValue K = - DAG.getTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32); + DAG.getSignedTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32); SDValue BarSignal = SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL, MVT::Other, K, Op.getOperand(0)), @@ -11173,8 +11174,9 @@ SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const { SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags); SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS); - SDValue SqrtSNextDownInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt, - DAG.getConstant(-1, DL, MVT::i32)); + SDValue SqrtSNextDownInt = + DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt, + DAG.getAllOnesConstant(DL, MVT::i32)); SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt); SDValue NegSqrtSNextDown = @@ -11296,7 +11298,7 @@ SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const { SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2); - SDValue ScaleDownFactor = DAG.getConstant(-128, DL, MVT::i32); + SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32); SDValue ScaleDown = DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt); SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags); @@ -14689,7 +14691,7 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N, (CRHS->isZero() && (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE))) return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0), - DAG.getConstant(-1, SL, MVT::i1)); + DAG.getAllOnesConstant(SL, MVT::i1)); if ((CRHS->isAllOnes() && (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) || (CRHS->isZero() && @@ -14715,7 +14717,7 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N, if ((CF == CRHSVal && CC == ISD::SETEQ) || (CT == CRHSVal && CC == ISD::SETNE)) return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0), - DAG.getConstant(-1, SL, MVT::i1)); + DAG.getAllOnesConstant(SL, MVT::i1)); if ((CF == CRHSVal && CC == ISD::SETNE) || (CT == CRHSVal && CC == ISD::SETEQ)) return LHS.getOperand(0); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 84cb1e48772ca..7f77270a93183 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -816,27 +816,29 @@ def as_i8imm : SDNodeXForm; def as_i8timm : SDNodeXFormgetTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i16); + return CurDAG->getSignedTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i16); }]>; def as_i16imm : SDNodeXFormgetTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i16); + return CurDAG->getSignedTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i16); }]>; def as_i16timm : SDNodeXFormgetTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i16); + // Explicit cast, as this is used with both signed and unsigned immediates. + return CurDAG->getSignedTargetConstant(int16_t(N->getSExtValue()), SDLoc(N), + MVT::i16); }]>; def as_i32imm: SDNodeXFormgetTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32); + return CurDAG->getSignedTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32); }]>; def as_i32timm: SDNodeXFormgetTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32); + return CurDAG->getSignedTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32); }]>; def as_i64imm: SDNodeXFormgetTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i64); + return CurDAG->getSignedTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i64); }]>; def cond_as_i32imm: SDNodeXFormgetValueAPF(); int Log2 = APF.getExactLog2Abs(); assert(Log2 != INT_MIN); - return CurDAG->getTargetConstant(Log2, SDLoc(N), MVT::i32); + return CurDAG->getSignedTargetConstant(Log2, SDLoc(N), MVT::i32); }]>; // Check if a floating point value is a power of 2 floating-point diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 6eae756b25fb5..c0021f69f18e7 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2093,7 +2093,7 @@ bool RISCVTargetLowering::shouldScalarizeBinop(SDValue VecOp) const { // Assume target opcodes can't be scalarized. // TODO - do we have any exceptions? - if (Opc >= ISD::BUILTIN_OP_END) + if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc)) return false; // If the vector op is not supported, try to convert to scalar. 
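The `shouldScalarizeBinop` tightening here (and in the WebAssembly and X86 hunks below) pairs with the `scalarizeExtractedBinOp` change earlier in the diff, which now also folds `extractelt (setcc X, SplatC), Idx` into a scalar compare of the extracted lane. A toy check of the lane-wise equivalence the combine relies on, in plain C++ with hypothetical values:

```cpp
#include <array>
#include <cassert>
#include <cstdint>

int main() {
  // extractelt (setcc X, SplatC), Idx  ==  setcc (extractelt X, Idx), C
  std::array<int32_t, 4> X = {5, -3, 7, 0}; // hypothetical lane values
  const int32_t C = 4;                      // splatted constant operand
  const unsigned Idx = 2;                   // constant extraction index

  std::array<bool, 4> VecCmp{}; // the vector setcc result, lane by lane
  for (unsigned I = 0; I < 4; ++I)
    VecCmp[I] = X[I] > C;

  bool Scalar = X[Idx] > C; // the scalarized form: compare one lane only
  assert(VecCmp[Idx] == Scalar);
  (void)Scalar;
}
```

Because the index is a constant and one operand is a constant splat or build vector, extracting first and comparing second is always lane-equivalent, which is what lets the combine drop the vector compare entirely.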
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td index b01af468d9ea2..2924083ece344 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td @@ -285,7 +285,8 @@ def : Pat<(riscv_fclass FPR64:$rs1), (FCLASS_D $rs1)>; def : PatFprFpr; def : PatFprFpr; -def : Pat<(fcopysign FPR64:$rs1, (fneg FPR64:$rs2)), (FSGNJN_D $rs1, $rs2)>; +def : Pat<(fcopysign FPR64:$rs1, (fneg FPR64:$rs2)), + (FSGNJN_D FPR64:$rs1, FPR64:$rs2)>; def : Pat<(fcopysign FPR64:$rs1, FPR32:$rs2), (FSGNJ_D $rs1, (FCVT_D_S $rs2, FRM_RNE))>; def : Pat<(fcopysign FPR32:$rs1, FPR64:$rs2), (FSGNJ_S $rs1, (FCVT_S_D $rs2, @@ -323,7 +324,7 @@ def : Pat<(riscv_fclass FPR64INX:$rs1), (FCLASS_D_INX $rs1)>; def : PatFprFpr; def : PatFprFpr; def : Pat<(fcopysign FPR64INX:$rs1, (fneg FPR64INX:$rs2)), - (FSGNJN_D_INX $rs1, $rs2)>; + (FSGNJN_D_INX FPR64INX:$rs1, FPR64INX:$rs2)>; def : Pat<(fcopysign FPR64INX:$rs1, FPR32INX:$rs2), (FSGNJ_D_INX $rs1, (f64 (FCVT_D_S_INX $rs2, FRM_RNE)))>; def : Pat<(fcopysign FPR32INX:$rs1, FPR64INX:$rs2), @@ -361,7 +362,7 @@ def : Pat<(riscv_fclass FPR64IN32X:$rs1), (FCLASS_D_IN32X $rs1)>; def : PatFprFpr; def : PatFprFpr; def : Pat<(fcopysign FPR64IN32X:$rs1, (fneg FPR64IN32X:$rs2)), - (FSGNJN_D_IN32X $rs1, $rs2)>; + (FSGNJN_D_IN32X FPR64IN32X:$rs1, FPR64IN32X:$rs2)>; def : Pat<(fcopysign FPR64IN32X:$rs1, FPR32INX:$rs2), (FSGNJ_D_IN32X $rs1, (FCVT_D_S_IN32X $rs2, FRM_RNE))>; def : Pat<(fcopysign FPR32INX:$rs1, FPR64IN32X:$rs2), diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td index 2c27e3950f07f..6c41c53bb301f 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td @@ -570,7 +570,8 @@ defm : PatFprFpr_m; } let Predicates = [HasStdExtF] in { -def : Pat<(fcopysign FPR32:$rs1, (fneg FPR32:$rs2)), (FSGNJN_S $rs1, $rs2)>; +def : Pat<(fcopysign FPR32:$rs1, (fneg FPR32:$rs2)), + (FSGNJN_S FPR32:$rs1, FPR32:$rs2)>; // fmadd: rs1 * rs2 + rs3 def : Pat<(any_fma FPR32:$rs1, FPR32:$rs2, FPR32:$rs3), @@ -594,7 +595,8 @@ def : Pat<(fneg (any_fma_nsz FPR32:$rs1, FPR32:$rs2, FPR32:$rs3)), } // Predicates = [HasStdExtF] let Predicates = [HasStdExtZfinx] in { -def : Pat<(fcopysign FPR32INX:$rs1, (fneg FPR32INX:$rs2)), (FSGNJN_S_INX $rs1, $rs2)>; +def : Pat<(fcopysign FPR32INX:$rs1, (fneg FPR32INX:$rs2)), + (FSGNJN_S_INX FPR32INX:$rs1, FPR32INX:$rs2)>; // fmadd: rs1 * rs2 + rs3 def : Pat<(any_fma FPR32INX:$rs1, FPR32INX:$rs2, FPR32INX:$rs3), diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td index e2e99cc3f2b72..625011c3b9f7c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td @@ -291,7 +291,8 @@ def : Pat<(riscv_fclass (f16 FPR16:$rs1)), (FCLASS_H $rs1)>; def : PatFprFpr; def : PatFprFpr; -def : Pat<(f16 (fcopysign FPR16:$rs1, (f16 (fneg FPR16:$rs2)))), (FSGNJN_H $rs1, $rs2)>; +def : Pat<(f16 (fcopysign FPR16:$rs1, (f16 (fneg FPR16:$rs2)))), + (FSGNJN_H FPR16:$rs1, FPR16:$rs2)>; def : Pat<(f16 (fcopysign FPR16:$rs1, FPR32:$rs2)), (FSGNJ_H $rs1, (f16 (FCVT_H_S $rs2, FRM_DYN)))>; @@ -334,7 +335,8 @@ def : Pat<(riscv_fclass FPR16INX:$rs1), (FCLASS_H_INX $rs1)>; def : PatFprFpr; def : PatFprFpr; -def : Pat<(fcopysign FPR16INX:$rs1, (fneg FPR16INX:$rs2)), (FSGNJN_H_INX $rs1, $rs2)>; +def : Pat<(fcopysign FPR16INX:$rs1, (fneg FPR16INX:$rs2)), + (FSGNJN_H_INX FPR16INX:$rs1, FPR16INX:$rs2)>; def : Pat<(fcopysign FPR16INX:$rs1, FPR32INX:$rs2), 
(FSGNJ_H_INX $rs1, (FCVT_H_S_INX $rs2, FRM_DYN))>; diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 45576d515112d..017264fbd46e0 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1026,7 +1026,9 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, if (ST->hasVInstructions() && LT.second.isVector()) { // vrsub.vi v10, v8, 0 // vmax.vv v8, v8, v10 - return LT.first * 2; + return LT.first * + getRISCVInstructionCost({RISCV::VRSUB_VI, RISCV::VMAX_VV}, + LT.second, CostKind); } break; } @@ -1157,6 +1159,16 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return getCmpSelInstrCost(Instruction::Select, ICA.getReturnType(), ICA.getArgTypes()[0], CmpInst::BAD_ICMP_PREDICATE, CostKind); + case Intrinsic::experimental_vp_splat: { + auto LT = getTypeLegalizationCost(RetTy); + // TODO: Lower i1 experimental_vp_splat + if (!ST->hasVInstructions() || LT.second.getScalarType() == MVT::i1) + return InstructionCost::getInvalid(); + return LT.first * getRISCVInstructionCost(LT.second.isFloatingPoint() + ? RISCV::VFMV_V_F + : RISCV::VMV_V_X, + LT.second, CostKind); + } case Intrinsic::vp_reduce_add: case Intrinsic::vp_reduce_fadd: case Intrinsic::vp_reduce_mul: @@ -2330,20 +2342,6 @@ bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) { return true; } -bool RISCVTTIImpl::areInlineCompatible(const Function *Caller, - const Function *Callee) const { - const TargetMachine &TM = getTLI()->getTargetMachine(); - - const FeatureBitset &CallerBits = - TM.getSubtargetImpl(*Caller)->getFeatureBits(); - const FeatureBitset &CalleeBits = - TM.getSubtargetImpl(*Callee)->getFeatureBits(); - - // Inline a callee if its target-features are a subset of the callers - // target-features. - return (CallerBits & CalleeBits) == CalleeBits; -} - /// See if \p I should be considered for address type promotion. We check if \p /// I is a sext with right type and used in memory accesses. If it used in a /// "complex" getelementptr, we allow it to be promoted without finding other diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 498f48353dc0c..6fd36e90a02dd 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -60,9 +60,6 @@ class RISCVTTIImpl : public BasicTTIImplBase { : BaseT(TM, F.getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} - bool areInlineCompatible(const Function *Caller, - const Function *Callee) const; - /// Return the cost of materializing an immediate for a value operand of /// a store instruction. InstructionCost getStoreImmCost(Type *VecTy, TTI::OperandValueInfo OpInfo, diff --git a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp index 90d7bd934af40..403d238aa5b52 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp @@ -671,7 +671,7 @@ void SystemZDAGToDAGISel::getAddressOperands(const SystemZAddressingMode &AM, } // Lower the displacement to a TargetConstant. 
- Disp = CurDAG->getTargetConstant(AM.Disp, SDLoc(Base), VT); + Disp = CurDAG->getSignedTargetConstant(AM.Disp, SDLoc(Base), VT); } void SystemZDAGToDAGISel::getAddressOperands(const SystemZAddressingMode &AM, @@ -2024,8 +2024,9 @@ SDValue SystemZDAGToDAGISel::expandSelectBoolean(SDNode *Node) { CurDAG->getConstant(IPM.XORValue, DL, MVT::i32)); if (IPM.AddValue) - Result = CurDAG->getNode(ISD::ADD, DL, MVT::i32, Result, - CurDAG->getConstant(IPM.AddValue, DL, MVT::i32)); + Result = + CurDAG->getNode(ISD::ADD, DL, MVT::i32, Result, + CurDAG->getSignedConstant(IPM.AddValue, DL, MVT::i32)); EVT VT = Node->getValueType(0); if (VT == MVT::i32 && IPM.Bit == 31) { diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 78d91299a357d..8f505b7e198cf 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1444,15 +1444,15 @@ void SystemZTargetLowering::LowerAsmOperandForConstraint( case 'K': // Signed 16-bit constant if (auto *C = dyn_cast(Op)) if (isInt<16>(C->getSExtValue())) - Ops.push_back(DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), - Op.getValueType())); + Ops.push_back(DAG.getSignedTargetConstant( + C->getSExtValue(), SDLoc(Op), Op.getValueType())); return; case 'L': // Signed 20-bit displacement (on all targets we support) if (auto *C = dyn_cast(Op)) if (isInt<20>(C->getSExtValue())) - Ops.push_back(DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), - Op.getValueType())); + Ops.push_back(DAG.getSignedTargetConstant( + C->getSExtValue(), SDLoc(Op), Op.getValueType())); return; case 'M': // 0x7fffffff @@ -2578,7 +2578,7 @@ static void adjustSubwordCmp(SelectionDAG &DAG, const SDLoc &DL, // Make sure that the second operand is an i32 with the right value. if (C.Op1.getValueType() != MVT::i32 || Value != ConstOp1->getZExtValue()) - C.Op1 = DAG.getConstant(Value, DL, MVT::i32); + C.Op1 = DAG.getConstant((uint32_t)Value, DL, MVT::i32); } // Return true if Op is either an unextended load, or a load suitable @@ -3410,7 +3410,7 @@ SDValue SystemZTargetLowering::lowerVectorSETCC(SelectionDAG &DAG, } if (Invert) { SDValue Mask = - DAG.getSplatBuildVector(VT, DL, DAG.getConstant(-1, DL, MVT::i64)); + DAG.getSplatBuildVector(VT, DL, DAG.getAllOnesConstant(DL, MVT::i64)); Cmp = DAG.getNode(ISD::XOR, DL, VT, Cmp, Mask); } if (Chain && Chain.getNode() != Cmp.getNode()) { @@ -3571,7 +3571,7 @@ SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node, // addition for it. if (Offset != 0) Result = DAG.getNode(ISD::ADD, DL, PtrVT, Result, - DAG.getConstant(Offset, DL, PtrVT)); + DAG.getSignedConstant(Offset, DL, PtrVT)); return Result; } @@ -3834,7 +3834,7 @@ SDValue SystemZTargetLowering::lowerRETURNADDR(SDValue Op, const auto *TFL = Subtarget.getFrameLowering(); int Offset = TFL->getReturnAddressOffset(MF); SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, FrameAddr, - DAG.getConstant(Offset, DL, PtrVT)); + DAG.getSignedConstant(Offset, DL, PtrVT)); return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo()); } @@ -4584,7 +4584,7 @@ static void getCSAddressAndShifts(SDValue Addr, SelectionDAG &DAG, SDLoc DL, // Get the address of the containing word. AlignedAddr = DAG.getNode(ISD::AND, DL, PtrVT, Addr, - DAG.getConstant(-4, DL, PtrVT)); + DAG.getSignedConstant(-4, DL, PtrVT)); // Get the number of bits that the word must be rotated left in order // to bring the field to the top bits of a GR32. 
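The recurring `getConstant` to `getSignedConstant` (and `getTargetConstant` to `getSignedTargetConstant`) switches in the AMDGPU and SystemZ hunks above all guard against the same trap: a negative immediate, such as the `-4` containing-word mask in `getCSAddressAndShifts`, must be sign-extended rather than zero-extended when it is widened to the node's integer type. A self-contained illustration without SelectionDAG:

```cpp
#include <cstdint>
#include <iostream>

int main() {
  int16_t Imm = -4; // e.g. an address-alignment mask immediate

  // Zero-extending the 16-bit pattern produces the wrong 64-bit value:
  uint64_t ZExt = static_cast<uint16_t>(Imm);
  // Sign-extending preserves the intended value:
  int64_t SExt = Imm;

  std::cout << std::hex << ZExt << '\n'; // fffc, no longer -4
  std::cout << std::dec << SExt << '\n'; // -4
}
```

The signed variants make the sign-extension explicit at the call site instead of relying on whatever implicit conversion the immediate's C++ type happens to trigger.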
@@ -4623,7 +4623,8 @@ SDValue SystemZTargetLowering::lowerATOMIC_LOAD_OP(SDValue Op, if (Opcode == SystemZISD::ATOMIC_LOADW_SUB) if (auto *Const = dyn_cast(Src2)) { Opcode = SystemZISD::ATOMIC_LOADW_ADD; - Src2 = DAG.getConstant(-Const->getSExtValue(), DL, Src2.getValueType()); + Src2 = DAG.getSignedConstant(-Const->getSExtValue(), DL, + Src2.getValueType()); } SDValue AlignedAddr, BitShift, NegBitShift; diff --git a/llvm/lib/Target/SystemZ/SystemZOperands.td b/llvm/lib/Target/SystemZ/SystemZOperands.td index 0221e2c53f2f4..64345ca3a1394 100644 --- a/llvm/lib/Target/SystemZ/SystemZOperands.td +++ b/llvm/lib/Target/SystemZ/SystemZOperands.td @@ -220,8 +220,8 @@ def NEGLF32 : SDNodeXFormgetTargetConstant(int8_t(N->getZExtValue()), SDLoc(N), - MVT::i64); + return CurDAG->getSignedTargetConstant(int8_t(N->getZExtValue()), SDLoc(N), + MVT::i64); }]>; // Truncate an immediate to a 8-bit unsigned quantity. @@ -244,14 +244,14 @@ def UIMM12 : SDNodeXFormgetTargetConstant(int16_t(N->getZExtValue()), SDLoc(N), - MVT::i64); + return CurDAG->getSignedTargetConstant(int16_t(N->getZExtValue()), SDLoc(N), + MVT::i64); }]>; // Negate and then truncate an immediate to a 16-bit signed quantity. def NEGSIMM16 : SDNodeXFormgetTargetConstant(int16_t(-N->getZExtValue()), SDLoc(N), - MVT::i64); + return CurDAG->getSignedTargetConstant(int16_t(-N->getZExtValue()), SDLoc(N), + MVT::i64); }]>; // Truncate an immediate to a 16-bit unsigned quantity. @@ -268,8 +268,8 @@ def SIMM32 : SDNodeXFormgetTargetConstant(int32_t(-N->getZExtValue()), SDLoc(N), - MVT::i64); + return CurDAG->getSignedTargetConstant(int32_t(-N->getZExtValue()), SDLoc(N), + MVT::i64); }]>; // Truncate an immediate to a 32-bit unsigned quantity. diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp index 4eb58e27f7ad7..c182c9890509f 100644 --- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp @@ -53,7 +53,7 @@ static SDValue emitMemMemReg(SelectionDAG &DAG, const SDLoc &DL, unsigned Op, int64_t Adj = getMemMemLenAdj(Op); SDValue LenAdj = DAG.getNode(ISD::ADD, DL, MVT::i64, DAG.getZExtOrTrunc(Size, DL, MVT::i64), - DAG.getConstant(0 - Adj, DL, MVT::i64)); + DAG.getSignedConstant(0 - Adj, DL, MVT::i64)); return createMemMemNode(DAG, DL, Op, Chain, Dst, Src, LenAdj, Byte); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 2d00889407ff4..a52af6832d583 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -429,7 +429,7 @@ bool WebAssemblyTargetLowering::shouldScalarizeBinop(SDValue VecOp) const { // Assume target opcodes can't be scalarized. // TODO - do we have any exceptions? - if (Opc >= ISD::BUILTIN_OP_END) + if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc)) return false; // If the vector op is not supported, try to convert to scalar. 
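Several per-target `areInlineCompatible` copies (AArch64 above, RISC-V, and the WebAssembly one removed just below) collapse into the single `BasicTTIImplBase` implementation added near the top of this diff. The rule is a plain bitset subset test; a toy version with `std::bitset` standing in for `llvm::FeatureBitset`:

```cpp
#include <bitset>
#include <cassert>

int main() {
  // One bit per subtarget feature, values are illustrative only.
  std::bitset<8> Caller("01101110"); // features the caller was compiled with
  std::bitset<8> Callee("00101100"); // features the callee requires

  // The hoisted rule: inlining is safe when the callee's features are a
  // subset of the caller's.
  bool Compatible = (Caller & Callee) == Callee;
  assert(Compatible);

  // A callee using a feature the caller lacks must not be inlined.
  std::bitset<8> Callee2("10000000");
  assert(((Caller & Callee2) == Callee2) == false);
}
```

Hoisting the check lets targets that need extra conditions, as AArch64 does for SME attributes, keep their own prelude and then defer to `BaseT::areInlineCompatible` for the shared subset test.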
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp index f96e3232b93f4..3d678e5384166 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -104,24 +104,6 @@ TTI::ReductionShuffle WebAssemblyTTIImpl::getPreferredExpandedReductionShuffle( return TTI::ReductionShuffle::SplitHalf; } -bool WebAssemblyTTIImpl::areInlineCompatible(const Function *Caller, - const Function *Callee) const { - // Allow inlining only when the Callee has a subset of the Caller's - // features. In principle, we should be able to inline regardless of any - // features because WebAssembly supports features at module granularity, not - // function granularity, but without this restriction it would be possible for - // a module to "forget" about features if all the functions that used them - // were inlined. - const TargetMachine &TM = getTLI()->getTargetMachine(); - - const FeatureBitset &CallerBits = - TM.getSubtargetImpl(*Caller)->getFeatureBits(); - const FeatureBitset &CalleeBits = - TM.getSubtargetImpl(*Callee)->getFeatureBits(); - - return (CallerBits & CalleeBits) == CalleeBits; -} - void WebAssemblyTTIImpl::getUnrollingPreferences( Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const { diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h index 2ce6cbf3ba026..9691120b2e531 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h @@ -72,9 +72,6 @@ class WebAssemblyTTIImpl final : public BasicTTIImplBase { TTI::ReductionShuffle getPreferredExpandedReductionShuffle(const IntrinsicInst *II) const; - bool areInlineCompatible(const Function *Caller, - const Function *Callee) const; - bool supportsTailCalls() const; bool isProfitableToSinkOperands(Instruction *I, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 9048d1d83f187..d3b569eccc17b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -3300,7 +3300,7 @@ bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const { // Assume target opcodes can't be scalarized. // TODO - do we have any exceptions? - if (Opc >= ISD::BUILTIN_OP_END) + if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc)) return false; // If the vector op is not supported, try to convert to scalar. diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index e2eae7fb8327c..b4033fc2a418a 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1185,14 +1185,27 @@ static Value *extractIntPart(const IntPart &P, IRBuilderBase &Builder) { /// (icmp eq X0, Y0) & (icmp eq X1, Y1) -> icmp eq X01, Y01 /// (icmp ne X0, Y0) | (icmp ne X1, Y1) -> icmp ne X01, Y01 /// where X0, X1 and Y0, Y1 are adjacent parts extracted from an integer. -Value *InstCombinerImpl::foldEqOfParts(ICmpInst *Cmp0, ICmpInst *Cmp1, - bool IsAnd) { +Value *InstCombinerImpl::foldEqOfParts(Value *Cmp0, Value *Cmp1, bool IsAnd) { if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse()) return nullptr; CmpInst::Predicate Pred = IsAnd ? 
CmpInst::ICMP_EQ : CmpInst::ICMP_NE; - auto GetMatchPart = [&](ICmpInst *Cmp, + auto GetMatchPart = [&](Value *CmpV, unsigned OpNo) -> std::optional { + assert(CmpV->getType()->isIntOrIntVectorTy(1) && "Must be bool"); + + Value *X, *Y; + // icmp ne (and x, 1), (and y, 1) <=> trunc (xor x, y) to i1 + // icmp eq (and x, 1), (and y, 1) <=> not (trunc (xor x, y) to i1) + if (Pred == CmpInst::ICMP_NE + ? match(CmpV, m_Trunc(m_Xor(m_Value(X), m_Value(Y)))) + : match(CmpV, m_Not(m_Trunc(m_Xor(m_Value(X), m_Value(Y)))))) + return {{OpNo == 0 ? X : Y, 0, 1}}; + + auto *Cmp = dyn_cast(CmpV); + if (!Cmp) + return std::nullopt; + if (Pred == Cmp->getPredicate()) return matchIntPart(Cmp->getOperand(OpNo)); @@ -1465,11 +1478,15 @@ Value *InstCombinerImpl::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS, // FCmp canonicalization ensures that (fcmp ord/uno X, X) and // (fcmp ord/uno X, C) will be transformed to (fcmp X, +0.0). - if (match(LHS1, m_PosZeroFP()) && match(RHS1, m_PosZeroFP())) + if (match(LHS1, m_PosZeroFP()) && match(RHS1, m_PosZeroFP())) { // Ignore the constants because they are obviously not NANs: // (fcmp ord x, 0.0) & (fcmp ord y, 0.0) -> (fcmp ord x, y) // (fcmp uno x, 0.0) | (fcmp uno y, 0.0) -> (fcmp uno x, y) + IRBuilder<>::FastMathFlagGuard FMFG(Builder); + Builder.setFastMathFlags(LHS->getFastMathFlags() & + RHS->getFastMathFlags()); return Builder.CreateFCmp(PredL, LHS0, RHS0); + } } if (IsAnd && stripSignOnlyFPOps(LHS0) == stripSignOnlyFPOps(RHS0)) { @@ -2728,47 +2745,31 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { foldBooleanAndOr(Op0, Op1, I, /*IsAnd=*/true, /*IsLogical=*/false)) return replaceInstUsesWith(I, Res); - { - ICmpInst *LHS = dyn_cast(Op0); - ICmpInst *RHS = dyn_cast(Op1); - - // TODO: Base this on foldBooleanAndOr instead? - // TODO: Make this recursive; it's a little tricky because an arbitrary - // number of 'and' instructions might have to be created. - if (LHS && match(Op1, m_OneUse(m_LogicalAnd(m_Value(X), m_Value(Y))))) { - bool IsLogical = isa(Op1); - // LHS & (X && Y) --> (LHS && X) && Y - if (auto *Cmp = dyn_cast(X)) - if (Value *Res = - foldAndOrOfICmps(LHS, Cmp, I, /* IsAnd */ true, IsLogical)) - return replaceInstUsesWith(I, IsLogical - ? Builder.CreateLogicalAnd(Res, Y) - : Builder.CreateAnd(Res, Y)); - // LHS & (X && Y) --> X && (LHS & Y) - if (auto *Cmp = dyn_cast(Y)) - if (Value *Res = foldAndOrOfICmps(LHS, Cmp, I, /* IsAnd */ true, - /* IsLogical */ false)) - return replaceInstUsesWith(I, IsLogical - ? Builder.CreateLogicalAnd(X, Res) - : Builder.CreateAnd(X, Res)); - } - if (RHS && match(Op0, m_OneUse(m_LogicalAnd(m_Value(X), m_Value(Y))))) { - bool IsLogical = isa(Op0); - // (X && Y) & RHS --> (X && RHS) && Y - if (auto *Cmp = dyn_cast(X)) - if (Value *Res = - foldAndOrOfICmps(Cmp, RHS, I, /* IsAnd */ true, IsLogical)) - return replaceInstUsesWith(I, IsLogical - ? Builder.CreateLogicalAnd(Res, Y) - : Builder.CreateAnd(Res, Y)); - // (X && Y) & RHS --> X && (Y & RHS) - if (auto *Cmp = dyn_cast(Y)) - if (Value *Res = foldAndOrOfICmps(Cmp, RHS, I, /* IsAnd */ true, - /* IsLogical */ false)) - return replaceInstUsesWith(I, IsLogical - ? Builder.CreateLogicalAnd(X, Res) - : Builder.CreateAnd(X, Res)); - } + // TODO: Make this recursive; it's a little tricky because an arbitrary + // number of 'and' instructions might have to be created. 
+ if (match(Op1, m_OneUse(m_LogicalAnd(m_Value(X), m_Value(Y))))) { + bool IsLogical = isa(Op1); + // Op0 & (X && Y) --> (Op0 && X) && Y + if (Value *Res = foldBooleanAndOr(Op0, X, I, /* IsAnd */ true, IsLogical)) + return replaceInstUsesWith(I, IsLogical ? Builder.CreateLogicalAnd(Res, Y) + : Builder.CreateAnd(Res, Y)); + // Op0 & (X && Y) --> X && (Op0 & Y) + if (Value *Res = foldBooleanAndOr(Op0, Y, I, /* IsAnd */ true, + /* IsLogical */ false)) + return replaceInstUsesWith(I, IsLogical ? Builder.CreateLogicalAnd(X, Res) + : Builder.CreateAnd(X, Res)); + } + if (match(Op0, m_OneUse(m_LogicalAnd(m_Value(X), m_Value(Y))))) { + bool IsLogical = isa(Op0); + // (X && Y) & Op1 --> (X && Op1) && Y + if (Value *Res = foldBooleanAndOr(X, Op1, I, /* IsAnd */ true, IsLogical)) + return replaceInstUsesWith(I, IsLogical ? Builder.CreateLogicalAnd(Res, Y) + : Builder.CreateAnd(Res, Y)); + // (X && Y) & Op1 --> X && (Y & Op1) + if (Value *Res = foldBooleanAndOr(Y, Op1, I, /* IsAnd */ true, + /* IsLogical */ false)) + return replaceInstUsesWith(I, IsLogical ? Builder.CreateLogicalAnd(X, Res) + : Builder.CreateAnd(X, Res)); } if (Instruction *FoldedFCmps = reassociateFCmps(I, Builder)) @@ -3416,9 +3417,6 @@ Value *InstCombinerImpl::foldAndOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, return X; } - if (Value *X = foldEqOfParts(LHS, RHS, IsAnd)) - return X; - // (icmp ne A, 0) | (icmp ne B, 0) --> (icmp ne (A|B), 0) // (icmp eq A, 0) & (icmp eq B, 0) --> (icmp eq (A|B), 0) // TODO: Remove this and below when foldLogOpOfMaskedICmps can handle undefs. @@ -3541,6 +3539,9 @@ Value *InstCombinerImpl::foldBooleanAndOr(Value *LHS, Value *RHS, if (Value *Res = foldLogicOfFCmps(LHSCmp, RHSCmp, IsAnd, IsLogical)) return Res; + if (Value *Res = foldEqOfParts(LHS, RHS, IsAnd)) + return Res; + return nullptr; } @@ -3829,48 +3830,31 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { foldBooleanAndOr(Op0, Op1, I, /*IsAnd=*/false, /*IsLogical=*/false)) return replaceInstUsesWith(I, Res); - { - ICmpInst *LHS = dyn_cast(Op0); - ICmpInst *RHS = dyn_cast(Op1); - - // TODO: Base this on foldBooleanAndOr instead? - // TODO: Make this recursive; it's a little tricky because an arbitrary - // number of 'or' instructions might have to be created. - Value *X, *Y; - if (LHS && match(Op1, m_OneUse(m_LogicalOr(m_Value(X), m_Value(Y))))) { - bool IsLogical = isa(Op1); - // LHS | (X || Y) --> (LHS || X) || Y - if (auto *Cmp = dyn_cast(X)) - if (Value *Res = - foldAndOrOfICmps(LHS, Cmp, I, /* IsAnd */ false, IsLogical)) - return replaceInstUsesWith(I, IsLogical - ? Builder.CreateLogicalOr(Res, Y) - : Builder.CreateOr(Res, Y)); - // LHS | (X || Y) --> X || (LHS | Y) - if (auto *Cmp = dyn_cast(Y)) - if (Value *Res = foldAndOrOfICmps(LHS, Cmp, I, /* IsAnd */ false, - /* IsLogical */ false)) - return replaceInstUsesWith(I, IsLogical - ? Builder.CreateLogicalOr(X, Res) - : Builder.CreateOr(X, Res)); - } - if (RHS && match(Op0, m_OneUse(m_LogicalOr(m_Value(X), m_Value(Y))))) { - bool IsLogical = isa(Op0); - // (X || Y) | RHS --> (X || RHS) || Y - if (auto *Cmp = dyn_cast(X)) - if (Value *Res = - foldAndOrOfICmps(Cmp, RHS, I, /* IsAnd */ false, IsLogical)) - return replaceInstUsesWith(I, IsLogical - ? Builder.CreateLogicalOr(Res, Y) - : Builder.CreateOr(Res, Y)); - // (X || Y) | RHS --> X || (Y | RHS) - if (auto *Cmp = dyn_cast(Y)) - if (Value *Res = foldAndOrOfICmps(Cmp, RHS, I, /* IsAnd */ false, - /* IsLogical */ false)) - return replaceInstUsesWith(I, IsLogical - ? 
Builder.CreateLogicalOr(X, Res) - : Builder.CreateOr(X, Res)); - } + // TODO: Make this recursive; it's a little tricky because an arbitrary + // number of 'or' instructions might have to be created. + if (match(Op1, m_OneUse(m_LogicalOr(m_Value(X), m_Value(Y))))) { + bool IsLogical = isa(Op1); + // Op0 | (X || Y) --> (Op0 || X) || Y + if (Value *Res = foldBooleanAndOr(Op0, X, I, /* IsAnd */ false, IsLogical)) + return replaceInstUsesWith(I, IsLogical ? Builder.CreateLogicalOr(Res, Y) + : Builder.CreateOr(Res, Y)); + // Op0 | (X || Y) --> X || (Op0 | Y) + if (Value *Res = foldBooleanAndOr(Op0, Y, I, /* IsAnd */ false, + /* IsLogical */ false)) + return replaceInstUsesWith(I, IsLogical ? Builder.CreateLogicalOr(X, Res) + : Builder.CreateOr(X, Res)); + } + if (match(Op0, m_OneUse(m_LogicalOr(m_Value(X), m_Value(Y))))) { + bool IsLogical = isa(Op0); + // (X || Y) | Op1 --> (X || Op1) || Y + if (Value *Res = foldBooleanAndOr(X, Op1, I, /* IsAnd */ false, IsLogical)) + return replaceInstUsesWith(I, IsLogical ? Builder.CreateLogicalOr(Res, Y) + : Builder.CreateOr(Res, Y)); + // (X || Y) | Op1 --> X || (Y | Op1) + if (Value *Res = foldBooleanAndOr(Y, Op1, I, /* IsAnd */ false, + /* IsLogical */ false)) + return replaceInstUsesWith(I, IsLogical ? Builder.CreateLogicalOr(X, Res) + : Builder.CreateOr(X, Res)); } if (Instruction *FoldedFCmps = reassociateFCmps(I, Builder)) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 9588930d7658c..0508ed48fc19c 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -412,7 +412,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final bool IsAnd, bool IsLogical = false); Value *foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS, BinaryOperator &Xor); - Value *foldEqOfParts(ICmpInst *Cmp0, ICmpInst *Cmp1, bool IsAnd); + Value *foldEqOfParts(Value *Cmp0, Value *Cmp1, bool IsAnd); Value *foldAndOrOfICmpsUsingRanges(ICmpInst *ICmp1, ICmpInst *ICmp2, bool IsAnd); diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 1991ec82d1e1e..b664bde5d320a 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -1662,21 +1662,43 @@ static bool areIdenticalUpToCommutativity(const Instruction *I1, /// \endcode /// /// So we need to turn hoisted load/store into cload/cstore. +/// +/// \param BI The branch instruction. +/// \param SpeculatedConditionalLoadsStores The load/store instructions that +/// will be speculated. +/// \param Invert indicates if speculates FalseBB. Only used in triangle CFG. static void hoistConditionalLoadsStores( BranchInst *BI, SmallVectorImpl &SpeculatedConditionalLoadsStores, - bool Invert) { + std::optional Invert) { auto &Context = BI->getParent()->getContext(); auto *VCondTy = FixedVectorType::get(Type::getInt1Ty(Context), 1); auto *Cond = BI->getOperand(0); // Construct the condition if needed. BasicBlock *BB = BI->getParent(); - IRBuilder<> Builder(SpeculatedConditionalLoadsStores.back()); - Value *Mask = Builder.CreateBitCast( - Invert ? Builder.CreateXor(Cond, ConstantInt::getTrue(Context)) : Cond, - VCondTy); + IRBuilder<> Builder( + Invert.has_value() ? SpeculatedConditionalLoadsStores.back() : BI); + Value *Mask = nullptr; + Value *MaskFalse = nullptr; + Value *MaskTrue = nullptr; + if (Invert.has_value()) { + Mask = Builder.CreateBitCast( + *Invert ? 
Builder.CreateXor(Cond, ConstantInt::getTrue(Context)) : Cond, + VCondTy); + } else { + MaskFalse = Builder.CreateBitCast( + Builder.CreateXor(Cond, ConstantInt::getTrue(Context)), VCondTy); + MaskTrue = Builder.CreateBitCast(Cond, VCondTy); + } + auto PeekThroughBitcasts = [](Value *V) { + while (auto *BitCast = dyn_cast(V)) + V = BitCast->getOperand(0); + return V; + }; for (auto *I : SpeculatedConditionalLoadsStores) { - IRBuilder<> Builder(I); + IRBuilder<> Builder(Invert.has_value() ? I : BI); + if (!Invert.has_value()) + Mask = I->getParent() == BI->getSuccessor(0) ? MaskTrue : MaskFalse; // We currently assume conditional faulting load/store is supported for // scalar types only when creating new instructions. This can be easily // extended for vector types in the future. @@ -1688,12 +1710,14 @@ static void hoistConditionalLoadsStores( auto *Ty = I->getType(); PHINode *PN = nullptr; Value *PassThru = nullptr; - for (User *U : I->users()) - if ((PN = dyn_cast(U))) { - PassThru = Builder.CreateBitCast(PN->getIncomingValueForBlock(BB), - FixedVectorType::get(Ty, 1)); - break; - } + if (Invert.has_value()) + for (User *U : I->users()) + if ((PN = dyn_cast(U))) { + PassThru = Builder.CreateBitCast( + PeekThroughBitcasts(PN->getIncomingValueForBlock(BB)), + FixedVectorType::get(Ty, 1)); + break; + } MaskedLoadStore = Builder.CreateMaskedLoad( FixedVectorType::get(Ty, 1), Op0, LI->getAlign(), Mask, PassThru); Value *NewLoadStore = Builder.CreateBitCast(MaskedLoadStore, Ty); @@ -1702,8 +1726,8 @@ static void hoistConditionalLoadsStores( I->replaceAllUsesWith(NewLoadStore); } else { // Handle Store. - auto *StoredVal = - Builder.CreateBitCast(Op0, FixedVectorType::get(Op0->getType(), 1)); + auto *StoredVal = Builder.CreateBitCast( + PeekThroughBitcasts(Op0), FixedVectorType::get(Op0->getType(), 1)); MaskedLoadStore = Builder.CreateMaskedStore( StoredVal, I->getOperand(1), cast(I)->getAlign(), Mask); } @@ -3155,7 +3179,8 @@ static bool validateAndCostRequiredSelects(BasicBlock *BB, BasicBlock *ThenBB, return HaveRewritablePHIs; } -static bool isProfitableToSpeculate(const BranchInst *BI, bool Invert, +static bool isProfitableToSpeculate(const BranchInst *BI, + std::optional Invert, const TargetTransformInfo &TTI) { // If the branch is non-unpredictable, and is predicted to *not* branch to // the `then` block, then avoid speculating it. @@ -3166,7 +3191,10 @@ static bool isProfitableToSpeculate(const BranchInst *BI, bool Invert, if (!extractBranchWeights(*BI, TWeight, FWeight) || (TWeight + FWeight) == 0) return true; - uint64_t EndWeight = Invert ? TWeight : FWeight; + if (!Invert.has_value()) + return false; + + uint64_t EndWeight = *Invert ? 
@@ -3155,7 +3179,8 @@ static bool validateAndCostRequiredSelects(BasicBlock *BB, BasicBlock *ThenBB,
   return HaveRewritablePHIs;
 }

-static bool isProfitableToSpeculate(const BranchInst *BI, bool Invert,
+static bool isProfitableToSpeculate(const BranchInst *BI,
+                                    std::optional<bool> Invert,
                                     const TargetTransformInfo &TTI) {
   // If the branch is non-unpredictable, and is predicted to *not* branch to
   // the `then` block, then avoid speculating it.
@@ -3166,7 +3191,10 @@ static bool isProfitableToSpeculate(const BranchInst *BI, bool Invert,
   if (!extractBranchWeights(*BI, TWeight, FWeight) || (TWeight + FWeight) == 0)
     return true;

-  uint64_t EndWeight = Invert ? TWeight : FWeight;
+  if (!Invert.has_value())
+    return false;
+
+  uint64_t EndWeight = *Invert ? TWeight : FWeight;
   BranchProbability BIEndProb =
       BranchProbability::getBranchProbability(EndWeight, TWeight + FWeight);
   BranchProbability Likely = TTI.getPredictableBranchThreshold();
@@ -8034,6 +8062,35 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
     if (HoistCommon &&
         hoistCommonCodeFromSuccessors(BI, !Options.HoistCommonInsts))
       return requestResimplify();
+
+    if (BI && HoistLoadsStoresWithCondFaulting &&
+        Options.HoistLoadsStoresWithCondFaulting &&
+        isProfitableToSpeculate(BI, std::nullopt, TTI)) {
+      SmallVector<Instruction *> SpeculatedConditionalLoadsStores;
+      auto CanSpeculateConditionalLoadsStores = [&]() {
+        for (auto *Succ : successors(BB)) {
+          for (Instruction &I : *Succ) {
+            if (I.isTerminator()) {
+              if (I.getNumSuccessors() > 1)
+                return false;
+              continue;
+            } else if (!isSafeCheapLoadStore(&I, TTI) ||
+                       SpeculatedConditionalLoadsStores.size() ==
+                           HoistLoadsStoresWithCondFaultingThreshold) {
+              return false;
+            }
+            SpeculatedConditionalLoadsStores.push_back(&I);
+          }
+        }
+        return !SpeculatedConditionalLoadsStores.empty();
+      };
+
+      if (CanSpeculateConditionalLoadsStores()) {
+        hoistConditionalLoadsStores(BI, SpeculatedConditionalLoadsStores,
+                                    std::nullopt);
+        return requestResimplify();
+      }
+    }
   } else {
     // If Successor #1 has multiple preds, we may be able to conditionally
     // execute Successor #0 if it branches to Successor #1.
diff --git a/llvm/test/Analysis/CostModel/RISCV/abs.ll b/llvm/test/Analysis/CostModel/RISCV/abs.ll
index 8f0dd7b0aefe9..7252716af8605 100644
--- a/llvm/test/Analysis/CostModel/RISCV/abs.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/abs.ll
@@ -44,37 +44,37 @@ declare <vscale x 64 x i8> @llvm.abs.nxv64i8(<vscale x 64 x i8>, i1)

 define i32 @abs(i32 %arg) {
 ; CHECK-LABEL: 'abs'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %2 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %3 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %4 = call <vscale x 2 x i64> @llvm.abs.nxv2i64(<vscale x 2 x i64> undef, i1 false)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %5 = call <vscale x 4 x i64> @llvm.abs.nxv4i64(<vscale x 4 x i64> undef, i1 false)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %6 = call <vscale x 8 x i64> @llvm.abs.nxv8i64(<vscale x 8 x i64> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %2 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %3 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %4 = call <vscale x 2 x i64> @llvm.abs.nxv2i64(<vscale x 2 x i64> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %5 = call <vscale x 4 x i64> @llvm.abs.nxv4i64(<vscale x 4 x i64> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %6 = call <vscale x 8 x i64> @llvm.abs.nxv8i64(<vscale x 8 x i64> undef, i1 false)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %7 = call <2 x i32> @llvm.abs.v2i32(<2 x i32> undef, i1 false)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %8 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> undef, i1 false)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %9 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> undef, i1 false)
-; CHECK-NEXT:  Cost Model: Found
an estimated cost of 2 for instruction: %10 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %10 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %11 = call @llvm.abs.nxv2i32( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %12 = call @llvm.abs.nxv4i32( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %13 = call @llvm.abs.nxv8i32( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %14 = call @llvm.abs.nxv16i32( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %12 = call @llvm.abs.nxv4i32( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %13 = call @llvm.abs.nxv8i32( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %14 = call @llvm.abs.nxv16i32( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %15 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %16 = call <4 x i16> @llvm.abs.v4i16(<4 x i16> undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %18 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %19 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %20 = call @llvm.abs.nxv2i16( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %21 = call @llvm.abs.nxv4i16( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %22 = call @llvm.abs.nxv8i16( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %23 = call @llvm.abs.nxv16i16( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %24 = call @llvm.abs.nxv32i16( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %22 = call @llvm.abs.nxv8i16( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %23 = call @llvm.abs.nxv16i16( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %24 = call @llvm.abs.nxv32i16( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %25 = call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %26 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %27 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> undef, i1 
false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %28 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %27 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %28 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %29 = call @llvm.abs.nxv8i8( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %30 = call @llvm.abs.nxv16i8( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %31 = call @llvm.abs.nxv32i8( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %32 = call @llvm.abs.nxv64i8( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %30 = call @llvm.abs.nxv16i8( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %31 = call @llvm.abs.nxv32i8( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %32 = call @llvm.abs.nxv64i8( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false) diff --git a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll index 800ea223850d3..c7cd845a0a03f 100644 --- a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll +++ b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -passes="print" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v | FileCheck %s -; RUN: opt < %s -passes="print" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v --type-based-intrinsic-cost=true | FileCheck %s --check-prefixes=TYPEBASED +; RUN: opt < %s -passes="print" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin | FileCheck %s +; RUN: opt < %s -passes="print" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin --type-based-intrinsic-cost=true | FileCheck %s --check-prefixes=TYPEBASED define void @unsupported_fp_ops( %vec, i32 %extraarg) { ; CHECK-LABEL: 'unsupported_fp_ops' @@ -1147,28 +1147,28 @@ define void @abs() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <2 x i64> @llvm.vp.abs.v2i64(<2 x i64> undef, i1 false, <2 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %11 = call <4 x i64> @llvm.vp.abs.v4i64(<4 x i64> undef, i1 false, <4 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %12 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %13 = call <8 x i64> @llvm.vp.abs.v8i64(<8 x i64> undef, i1 false, <8 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %14 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false) -; CHECK-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %15 = call <16 x i64> @llvm.vp.abs.v16i64(<16 x i64> undef, i1 false, <16 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %16 = call <16 x i64> @llvm.abs.v16i64(<16 x i64> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %11 = call <4 x i64> @llvm.vp.abs.v4i64(<4 x i64> undef, i1 false, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %12 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %13 = call <8 x i64> @llvm.vp.abs.v8i64(<8 x i64> undef, i1 false, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %14 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %15 = call <16 x i64> @llvm.vp.abs.v16i64(<16 x i64> undef, i1 false, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %16 = call <16 x i64> @llvm.abs.v16i64(<16 x i64> undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call @llvm.vp.abs.nxv2i8( undef, i1 false, undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call @llvm.abs.nxv2i8( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = call @llvm.vp.abs.nxv4i8( undef, i1 false, undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %20 = call @llvm.abs.nxv4i8( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %21 = call @llvm.vp.abs.nxv8i8( undef, i1 false, undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %22 = call @llvm.abs.nxv8i8( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %23 = call @llvm.vp.abs.nxv16i8( undef, i1 false, undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %24 = call @llvm.abs.nxv16i8( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %25 = call @llvm.vp.abs.nxv2i64( undef, i1 false, undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %26 = call @llvm.abs.nxv2i64( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %27 = call @llvm.vp.abs.nxv4i64( undef, i1 false, undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %28 = call @llvm.abs.nxv4i64( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %29 = call @llvm.vp.abs.nxv8i64( undef, i1 false, undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %30 = call @llvm.abs.nxv8i64( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %31 = call @llvm.vp.abs.nxv16i64( undef, i1 false, undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %32 = call @llvm.abs.nxv16i64( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %23 = call @llvm.vp.abs.nxv16i8( undef, i1 false, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %24 = call @llvm.abs.nxv16i8( undef, i1 false) +; CHECK-NEXT: Cost 
Model: Found an estimated cost of 4 for instruction: %25 = call @llvm.vp.abs.nxv2i64( undef, i1 false, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %26 = call @llvm.abs.nxv2i64( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %27 = call @llvm.vp.abs.nxv4i64( undef, i1 false, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %28 = call @llvm.abs.nxv4i64( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %29 = call @llvm.vp.abs.nxv8i64( undef, i1 false, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %30 = call @llvm.abs.nxv8i64( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %31 = call @llvm.vp.abs.nxv16i64( undef, i1 false, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %32 = call @llvm.abs.nxv16i64( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; TYPEBASED-LABEL: 'abs' @@ -1182,28 +1182,28 @@ define void @abs() { ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 false) ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <2 x i64> @llvm.vp.abs.v2i64(<2 x i64> undef, i1 false, <2 x i1> undef, i32 undef) ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %11 = call <4 x i64> @llvm.vp.abs.v4i64(<4 x i64> undef, i1 false, <4 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %12 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %13 = call <8 x i64> @llvm.vp.abs.v8i64(<8 x i64> undef, i1 false, <8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %14 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %15 = call <16 x i64> @llvm.vp.abs.v16i64(<16 x i64> undef, i1 false, <16 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %16 = call <16 x i64> @llvm.abs.v16i64(<16 x i64> undef, i1 false) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %11 = call <4 x i64> @llvm.vp.abs.v4i64(<4 x i64> undef, i1 false, <4 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %12 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %13 = call <8 x i64> @llvm.vp.abs.v8i64(<8 x i64> undef, i1 false, <8 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %14 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %15 = call <16 x i64> @llvm.vp.abs.v16i64(<16 x i64> undef, i1 false, <16 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %16 = call <16 x i64> @llvm.abs.v16i64(<16 x i64> undef, i1 false) ; TYPEBASED-NEXT: Cost Model: Found 
an estimated cost of 2 for instruction: %17 = call @llvm.vp.abs.nxv2i8( undef, i1 false, undef, i32 undef) ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call @llvm.abs.nxv2i8( undef, i1 false) ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = call @llvm.vp.abs.nxv4i8( undef, i1 false, undef, i32 undef) ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %20 = call @llvm.abs.nxv4i8( undef, i1 false) ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %21 = call @llvm.vp.abs.nxv8i8( undef, i1 false, undef, i32 undef) ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %22 = call @llvm.abs.nxv8i8( undef, i1 false) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %23 = call @llvm.vp.abs.nxv16i8( undef, i1 false, undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %24 = call @llvm.abs.nxv16i8( undef, i1 false) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %25 = call @llvm.vp.abs.nxv2i64( undef, i1 false, undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %26 = call @llvm.abs.nxv2i64( undef, i1 false) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %27 = call @llvm.vp.abs.nxv4i64( undef, i1 false, undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %28 = call @llvm.abs.nxv4i64( undef, i1 false) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %29 = call @llvm.vp.abs.nxv8i64( undef, i1 false, undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %30 = call @llvm.abs.nxv8i64( undef, i1 false) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %31 = call @llvm.vp.abs.nxv16i64( undef, i1 false, undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %32 = call @llvm.abs.nxv16i64( undef, i1 false) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %23 = call @llvm.vp.abs.nxv16i8( undef, i1 false, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %24 = call @llvm.abs.nxv16i8( undef, i1 false) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %25 = call @llvm.vp.abs.nxv2i64( undef, i1 false, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %26 = call @llvm.abs.nxv2i64( undef, i1 false) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %27 = call @llvm.vp.abs.nxv4i64( undef, i1 false, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %28 = call @llvm.abs.nxv4i64( undef, i1 false) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %29 = call @llvm.vp.abs.nxv8i64( undef, i1 false, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %30 = call @llvm.abs.nxv8i64( undef, i1 false) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %31 = call @llvm.vp.abs.nxv16i64( undef, i1 false, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %32 = call @llvm.abs.nxv16i64( undef, i1 false) ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call <2 x 
i8> @llvm.vp.abs.v2i8(<2 x i8> undef, i1 0, <2 x i1> undef, i32 undef) @@ -2125,6 +2125,232 @@ define void @vp_fdiv(){ ret void } +define void @splat() { +; CHECK-LABEL: 'splat' +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %1 = call <2 x i1> @llvm.experimental.vp.splat.v2i1(i1 undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %2 = call <4 x i1> @llvm.experimental.vp.splat.v4i1(i1 undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %3 = call <8 x i1> @llvm.experimental.vp.splat.v8i1(i1 undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %4 = call <16 x i1> @llvm.experimental.vp.splat.v16i1(i1 undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = call <2 x i8> @llvm.experimental.vp.splat.v2i8(i8 undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <4 x i8> @llvm.experimental.vp.splat.v4i8(i8 undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <8 x i8> @llvm.experimental.vp.splat.v8i8(i8 undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <16 x i8> @llvm.experimental.vp.splat.v16i8(i8 undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <2 x i16> @llvm.experimental.vp.splat.v2i16(i16 undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call <4 x i16> @llvm.experimental.vp.splat.v4i16(i16 undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = call <8 x i16> @llvm.experimental.vp.splat.v8i16(i16 undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %12 = call <16 x i16> @llvm.experimental.vp.splat.v16i16(i16 undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = call <2 x i32> @llvm.experimental.vp.splat.v2i32(i32 undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = call <4 x i32> @llvm.experimental.vp.splat.v4i32(i32 undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %15 = call <8 x i32> @llvm.experimental.vp.splat.v8i32(i32 undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %16 = call <16 x i32> @llvm.experimental.vp.splat.v16i32(i32 undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = call <2 x i64> @llvm.experimental.vp.splat.v2i64(i64 undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call <4 x i64> @llvm.experimental.vp.splat.v4i64(i64 undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %19 = call <8 x i64> @llvm.experimental.vp.splat.v8i64(i64 undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %20 = call <16 x i64> @llvm.experimental.vp.splat.v16i64(i64 undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = call <2 x bfloat> @llvm.experimental.vp.splat.v2bf16(bfloat 
undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = call <4 x bfloat> @llvm.experimental.vp.splat.v4bf16(bfloat undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = call <8 x bfloat> @llvm.experimental.vp.splat.v8bf16(bfloat undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %24 = call <16 x bfloat> @llvm.experimental.vp.splat.v16bf16(bfloat undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %25 = call <2 x half> @llvm.experimental.vp.splat.v2f16(half undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %26 = call <4 x half> @llvm.experimental.vp.splat.v4f16(half undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = call <8 x half> @llvm.experimental.vp.splat.v8f16(half undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %28 = call <16 x half> @llvm.experimental.vp.splat.v16f16(half undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %29 = call <2 x float> @llvm.experimental.vp.splat.v2f32(float undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %30 = call <4 x float> @llvm.experimental.vp.splat.v4f32(float undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %31 = call <8 x float> @llvm.experimental.vp.splat.v8f32(float undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %32 = call <16 x float> @llvm.experimental.vp.splat.v16f32(float undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %33 = call <2 x double> @llvm.experimental.vp.splat.v2f64(double undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %34 = call <4 x double> @llvm.experimental.vp.splat.v4f64(double undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %35 = call <8 x double> @llvm.experimental.vp.splat.v8f64(double undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %36 = call <16 x double> @llvm.experimental.vp.splat.v16f64(double undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %37 = call @llvm.experimental.vp.splat.nxv2i1(i1 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %38 = call @llvm.experimental.vp.splat.nxv4i1(i1 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %39 = call @llvm.experimental.vp.splat.nxv8i1(i1 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %40 = call @llvm.experimental.vp.splat.nxv16i1(i1 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %41 = call @llvm.experimental.vp.splat.nxv2i8(i8 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %42 = call @llvm.experimental.vp.splat.nxv4i8(i8 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %43 = call @llvm.experimental.vp.splat.nxv8i8(i8 undef, 
undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %44 = call @llvm.experimental.vp.splat.nxv16i8(i8 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %45 = call @llvm.experimental.vp.splat.nxv2i16(i16 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %46 = call @llvm.experimental.vp.splat.nxv4i16(i16 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %47 = call @llvm.experimental.vp.splat.nxv8i16(i16 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %48 = call @llvm.experimental.vp.splat.nxv16i16(i16 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %49 = call @llvm.experimental.vp.splat.nxv2i32(i32 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %50 = call @llvm.experimental.vp.splat.nxv4i32(i32 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %51 = call @llvm.experimental.vp.splat.nxv8i32(i32 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %52 = call @llvm.experimental.vp.splat.nxv16i32(i32 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %53 = call @llvm.experimental.vp.splat.nxv2i64(i64 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %54 = call @llvm.experimental.vp.splat.nxv4i64(i64 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %55 = call @llvm.experimental.vp.splat.nxv8i64(i64 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %56 = call @llvm.experimental.vp.splat.nxv16i64(i64 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %57 = call @llvm.experimental.vp.splat.nxv2bf16(bfloat undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %58 = call @llvm.experimental.vp.splat.nxv4bf16(bfloat undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %59 = call @llvm.experimental.vp.splat.nxv8bf16(bfloat undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %60 = call @llvm.experimental.vp.splat.nxv16bf16(bfloat undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %61 = call @llvm.experimental.vp.splat.nxv2f16(half undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %62 = call @llvm.experimental.vp.splat.nxv4f16(half undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %63 = call @llvm.experimental.vp.splat.nxv8f16(half undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %64 = call @llvm.experimental.vp.splat.nxv16f16(half undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %65 = call @llvm.experimental.vp.splat.nxv2f32(float undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %66 = call @llvm.experimental.vp.splat.nxv4f32(float undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %67 = 
call @llvm.experimental.vp.splat.nxv8f32(float undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %68 = call @llvm.experimental.vp.splat.nxv16f32(float undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %69 = call @llvm.experimental.vp.splat.nxv2f64(double undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %70 = call @llvm.experimental.vp.splat.nxv4f64(double undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %71 = call @llvm.experimental.vp.splat.nxv8f64(double undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %72 = call @llvm.experimental.vp.splat.nxv16f64(double undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; TYPEBASED-LABEL: 'splat' +; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %1 = call <2 x i1> @llvm.experimental.vp.splat.v2i1(i1 undef, <2 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %2 = call <4 x i1> @llvm.experimental.vp.splat.v4i1(i1 undef, <4 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %3 = call <8 x i1> @llvm.experimental.vp.splat.v8i1(i1 undef, <8 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %4 = call <16 x i1> @llvm.experimental.vp.splat.v16i1(i1 undef, <16 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = call <2 x i8> @llvm.experimental.vp.splat.v2i8(i8 undef, <2 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <4 x i8> @llvm.experimental.vp.splat.v4i8(i8 undef, <4 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <8 x i8> @llvm.experimental.vp.splat.v8i8(i8 undef, <8 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <16 x i8> @llvm.experimental.vp.splat.v16i8(i8 undef, <16 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <2 x i16> @llvm.experimental.vp.splat.v2i16(i16 undef, <2 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call <4 x i16> @llvm.experimental.vp.splat.v4i16(i16 undef, <4 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = call <8 x i16> @llvm.experimental.vp.splat.v8i16(i16 undef, <8 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %12 = call <16 x i16> @llvm.experimental.vp.splat.v16i16(i16 undef, <16 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = call <2 x i32> @llvm.experimental.vp.splat.v2i32(i32 undef, <2 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = call <4 x i32> @llvm.experimental.vp.splat.v4i32(i32 undef, <4 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %15 = call <8 x i32> @llvm.experimental.vp.splat.v8i32(i32 undef, <8 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %16 = call <16 x i32> @llvm.experimental.vp.splat.v16i32(i32 undef, 
<16 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = call <2 x i64> @llvm.experimental.vp.splat.v2i64(i64 undef, <2 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call <4 x i64> @llvm.experimental.vp.splat.v4i64(i64 undef, <4 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %19 = call <8 x i64> @llvm.experimental.vp.splat.v8i64(i64 undef, <8 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %20 = call <16 x i64> @llvm.experimental.vp.splat.v16i64(i64 undef, <16 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = call <2 x bfloat> @llvm.experimental.vp.splat.v2bf16(bfloat undef, <2 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = call <4 x bfloat> @llvm.experimental.vp.splat.v4bf16(bfloat undef, <4 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = call <8 x bfloat> @llvm.experimental.vp.splat.v8bf16(bfloat undef, <8 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %24 = call <16 x bfloat> @llvm.experimental.vp.splat.v16bf16(bfloat undef, <16 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %25 = call <2 x half> @llvm.experimental.vp.splat.v2f16(half undef, <2 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %26 = call <4 x half> @llvm.experimental.vp.splat.v4f16(half undef, <4 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = call <8 x half> @llvm.experimental.vp.splat.v8f16(half undef, <8 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %28 = call <16 x half> @llvm.experimental.vp.splat.v16f16(half undef, <16 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %29 = call <2 x float> @llvm.experimental.vp.splat.v2f32(float undef, <2 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %30 = call <4 x float> @llvm.experimental.vp.splat.v4f32(float undef, <4 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %31 = call <8 x float> @llvm.experimental.vp.splat.v8f32(float undef, <8 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %32 = call <16 x float> @llvm.experimental.vp.splat.v16f32(float undef, <16 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %33 = call <2 x double> @llvm.experimental.vp.splat.v2f64(double undef, <2 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %34 = call <4 x double> @llvm.experimental.vp.splat.v4f64(double undef, <4 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %35 = call <8 x double> @llvm.experimental.vp.splat.v8f64(double undef, <8 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %36 = call <16 x double> @llvm.experimental.vp.splat.v16f64(double undef, <16 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: 
Invalid cost for instruction: %37 = call @llvm.experimental.vp.splat.nxv2i1(i1 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %38 = call @llvm.experimental.vp.splat.nxv4i1(i1 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %39 = call @llvm.experimental.vp.splat.nxv8i1(i1 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %40 = call @llvm.experimental.vp.splat.nxv16i1(i1 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %41 = call @llvm.experimental.vp.splat.nxv2i8(i8 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %42 = call @llvm.experimental.vp.splat.nxv4i8(i8 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %43 = call @llvm.experimental.vp.splat.nxv8i8(i8 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %44 = call @llvm.experimental.vp.splat.nxv16i8(i8 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %45 = call @llvm.experimental.vp.splat.nxv2i16(i16 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %46 = call @llvm.experimental.vp.splat.nxv4i16(i16 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %47 = call @llvm.experimental.vp.splat.nxv8i16(i16 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %48 = call @llvm.experimental.vp.splat.nxv16i16(i16 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %49 = call @llvm.experimental.vp.splat.nxv2i32(i32 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %50 = call @llvm.experimental.vp.splat.nxv4i32(i32 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %51 = call @llvm.experimental.vp.splat.nxv8i32(i32 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %52 = call @llvm.experimental.vp.splat.nxv16i32(i32 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %53 = call @llvm.experimental.vp.splat.nxv2i64(i64 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %54 = call @llvm.experimental.vp.splat.nxv4i64(i64 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %55 = call @llvm.experimental.vp.splat.nxv8i64(i64 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %56 = call @llvm.experimental.vp.splat.nxv16i64(i64 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %57 = call @llvm.experimental.vp.splat.nxv2bf16(bfloat undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %58 = call @llvm.experimental.vp.splat.nxv4bf16(bfloat undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %59 = call @llvm.experimental.vp.splat.nxv8bf16(bfloat undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %60 = call 
@llvm.experimental.vp.splat.nxv16bf16(bfloat undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %61 = call @llvm.experimental.vp.splat.nxv2f16(half undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %62 = call @llvm.experimental.vp.splat.nxv4f16(half undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %63 = call @llvm.experimental.vp.splat.nxv8f16(half undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %64 = call @llvm.experimental.vp.splat.nxv16f16(half undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %65 = call @llvm.experimental.vp.splat.nxv2f32(float undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %66 = call @llvm.experimental.vp.splat.nxv4f32(float undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %67 = call @llvm.experimental.vp.splat.nxv8f32(float undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %68 = call @llvm.experimental.vp.splat.nxv16f32(float undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %69 = call @llvm.experimental.vp.splat.nxv2f64(double undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %70 = call @llvm.experimental.vp.splat.nxv4f64(double undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %71 = call @llvm.experimental.vp.splat.nxv8f64(double undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %72 = call @llvm.experimental.vp.splat.nxv16f64(double undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + call <2 x i1> @llvm.experimental.vp.splat.v2i1(i1 undef, <2 x i1> undef, i32 undef) + call <4 x i1> @llvm.experimental.vp.splat.v4i1(i1 undef, <4 x i1> undef, i32 undef) + call <8 x i1> @llvm.experimental.vp.splat.v8i1(i1 undef, <8 x i1> undef, i32 undef) + call <16 x i1> @llvm.experimental.vp.splat.v16i1(i1 undef, <16 x i1> undef, i32 undef) + call <2 x i8> @llvm.experimental.vp.splat.v2i8(i8 undef, <2 x i1> undef, i32 undef) + call <4 x i8> @llvm.experimental.vp.splat.v4i8(i8 undef, <4 x i1> undef, i32 undef) + call <8 x i8> @llvm.experimental.vp.splat.v8i8(i8 undef, <8 x i1> undef, i32 undef) + call <16 x i8> @llvm.experimental.vp.splat.v16i8(i8 undef, <16 x i1> undef, i32 undef) + call <2 x i16> @llvm.experimental.vp.splat.v2i16(i16 undef, <2 x i1> undef, i32 undef) + call <4 x i16> @llvm.experimental.vp.splat.v4i16(i16 undef, <4 x i1> undef, i32 undef) + call <8 x i16> @llvm.experimental.vp.splat.v8i16(i16 undef, <8 x i1> undef, i32 undef) + call <16 x i16> @llvm.experimental.vp.splat.v16i16(i16 undef, <16 x i1> undef, i32 undef) + call <2 x i32> @llvm.experimental.vp.splat.v2i32(i32 undef, <2 x i1> undef, i32 undef) + call <4 x i32> @llvm.experimental.vp.splat.v4i32(i32 undef, <4 x i1> undef, i32 undef) + call <8 x i32> @llvm.experimental.vp.splat.v8i32(i32 undef, <8 x i1> undef, i32 undef) + call <16 x i32> @llvm.experimental.vp.splat.v16i32(i32 undef, <16 x i1> undef, i32 undef) + call <2 x i64> @llvm.experimental.vp.splat.v2i64(i64 undef, <2 x i1> undef, i32 undef) + call <4 x i64> 
@llvm.experimental.vp.splat.v4i64(i64 undef, <4 x i1> undef, i32 undef) + call <8 x i64> @llvm.experimental.vp.splat.v8i64(i64 undef, <8 x i1> undef, i32 undef) + call <16 x i64> @llvm.experimental.vp.splat.v16i64(i64 undef, <16 x i1> undef, i32 undef) + call <2 x bfloat> @llvm.experimental.vp.splat.v2bf16(bfloat undef, <2 x i1> undef, i32 undef) + call <4 x bfloat> @llvm.experimental.vp.splat.v4bf16(bfloat undef, <4 x i1> undef, i32 undef) + call <8 x bfloat> @llvm.experimental.vp.splat.v8bf16(bfloat undef, <8 x i1> undef, i32 undef) + call <16 x bfloat> @llvm.experimental.vp.splat.v16bf16(bfloat undef, <16 x i1> undef, i32 undef) + call <2 x half> @llvm.experimental.vp.splat.v2f16(half undef, <2 x i1> undef, i32 undef) + call <4 x half> @llvm.experimental.vp.splat.v4f16(half undef, <4 x i1> undef, i32 undef) + call <8 x half> @llvm.experimental.vp.splat.v8f16(half undef, <8 x i1> undef, i32 undef) + call <16 x half> @llvm.experimental.vp.splat.v16f16(half undef, <16 x i1> undef, i32 undef) + call <2 x float> @llvm.experimental.vp.splat.v2f32(float undef, <2 x i1> undef, i32 undef) + call <4 x float> @llvm.experimental.vp.splat.v4f32(float undef, <4 x i1> undef, i32 undef) + call <8 x float> @llvm.experimental.vp.splat.v8f32(float undef, <8 x i1> undef, i32 undef) + call <16 x float> @llvm.experimental.vp.splat.v16f32(float undef, <16 x i1> undef, i32 undef) + call <2 x double> @llvm.experimental.vp.splat.v2f64(double undef, <2 x i1> undef, i32 undef) + call <4 x double> @llvm.experimental.vp.splat.v4f64(double undef, <4 x i1> undef, i32 undef) + call <8 x double> @llvm.experimental.vp.splat.v8f64(double undef, <8 x i1> undef, i32 undef) + call <16 x double> @llvm.experimental.vp.splat.v16f64(double undef, <16 x i1> undef, i32 undef) + call @llvm.experimental.vp.splat.nxv2i1(i1 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv4i1(i1 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv8i1(i1 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv16i1(i1 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv2i8(i8 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv4i8(i8 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv8i8(i8 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv16i8(i8 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv2i16(i16 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv4i16(i16 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv8i16(i16 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv16i16(i16 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv2i32(i32 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv4i32(i32 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv8i32(i32 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv16i32(i32 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv2i64(i64 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv4i64(i64 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv8i64(i64 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv16i64(i64 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv2bf16(bfloat undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv4bf16(bfloat undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv8bf16(bfloat undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv16bf16(bfloat undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv2f16(half undef, 
undef, i32 undef) + call @llvm.experimental.vp.splat.nxv4f16(half undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv8f16(half undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv16f16(half undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv2f32(float undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv4f32(float undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv8f32(float undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv16f32(float undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv2f64(double undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv4f64(double undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv8f64(double undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv16f64(double undef, undef, i32 undef) + ret void +} + declare <2 x i8> @llvm.vp.add.v2i8(<2 x i8>, <2 x i8>, <2 x i1>, i32) declare <4 x i8> @llvm.vp.add.v4i8(<4 x i8>, <4 x i8>, <4 x i1>, i32) declare <8 x i8> @llvm.vp.add.v8i8(<8 x i8>, <8 x i8>, <8 x i1>, i32) diff --git a/llvm/test/CodeGen/AArch64/arm64-vshift.ll b/llvm/test/CodeGen/AArch64/arm64-vshift.ll index 7af7c235f9ac1..2f543cc324bc2 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vshift.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vshift.ll @@ -1,12 +1,114 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -enable-misched=false | FileCheck %s +; RUN: llc < %s -mtriple=arm64-eabi -global-isel=0 | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -mtriple=arm64-eabi -global-isel=1 -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for sqshl1d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshl1d_constant +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshl_scalar +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshl_scalar_constant +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqshl1d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqshl1d_constant +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqshl_scalar +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqshl_scalar_constant +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for srshl1d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for srshl1d_constant +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for srshl_scalar +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for srshl_scalar_constant +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for urshl1d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for urshl1d_constant +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for urshl_scalar +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for urshl_scalar_constant +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshl1d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshl1d_constant +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshl_scalar +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshl_scalar_constant +; CHECK-GI-NEXT: warning: Instruction selection used fallback path 
for uqrshl1d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqrshl1d_constant +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqrshl_scalar +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqrshl_scalar_constant +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for urshr1d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for urshr_scalar +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for srshr1d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for srshr_scalar +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshlu8b +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshlu4h +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshlu2s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshlu16b +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshlu8h +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshlu4s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshlu2d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshlu1d_constant +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshlu_i64_constant +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshlu_i32_constant +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshrn1s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshrn8b +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshrn4h +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshrn2s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshrn16b +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshrn8h +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshrn4s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshrun1s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshrun8b +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshrun4h +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshrun2s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshrun16b +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshrun8h +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshrun4s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshrn1s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshrn8b +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshrn4h +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshrn2s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshrn16b +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshrn8h +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshrn4s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshrun1s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshrun8b +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshrun4h +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshrun2s +; CHECK-GI-NEXT: warning: Instruction selection 
used fallback path for sqrshrun16b
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshrun8h
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshrun4s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqrshrn1s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqrshrn8b
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqrshrn4h
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqrshrn2s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqrshrn16b
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqrshrn8h
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqrshrn4s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqshrn1s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqshrn8b
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqshrn4h
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqshrn2s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqshrn16b
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqshrn8h
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqshrn4s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for neon_ushl_vscalar_constant_shift
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for neon_ushl_scalar_constant_shift
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for neon_sshll_vscalar_constant_shift
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for neon_sshll_scalar_constant_shift
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for neon_sshll_scalar_constant_shift_m1
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for ursra1d
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for ursra_scalar
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for srsra1d
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for srsra_scalar
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sli8b
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sli4h
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sli2s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sli1d
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sli16b
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sli8h
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sli4s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sli2d
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshlu_zero_shift_amount
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for lshr_trunc_v2i64_v2i8
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for ashr_trunc_v2i64_v2i8

 define <8 x i8> @sqshl8b(ptr %A, ptr %B) nounwind {
 ; CHECK-LABEL: sqshl8b:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: sqshl.8b v0, v0, v1
+; CHECK-NEXT: sqshl v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp2 = load <8 x i8>, ptr %B
@@ -19,7 +121,7 @@ define <4 x i16> @sqshl4h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: sqshl.4h v0, v0, v1
+; CHECK-NEXT: sqshl v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp2 = load <4 x i16>, ptr %B
@@ -32,7 +134,7 @@ define <2 x i32> @sqshl2s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: sqshl.2s v0, v0, v1
+; CHECK-NEXT: sqshl v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp2 = load <2 x i32>, ptr %B
@@ -97,7 +199,7 @@ define <8 x i8> @uqshl8b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: uqshl.8b v0, v0, v1
+; CHECK-NEXT: uqshl v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp2 = load <8 x i8>, ptr %B
@@ -110,7 +212,7 @@ define <4 x i16> @uqshl4h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: uqshl.4h v0, v0, v1
+; CHECK-NEXT: uqshl v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp2 = load <4 x i16>, ptr %B
@@ -123,7 +225,7 @@ define <2 x i32> @uqshl2s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: uqshl.2s v0, v0, v1
+; CHECK-NEXT: uqshl v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp2 = load <2 x i32>, ptr %B
@@ -136,7 +238,7 @@ define <16 x i8> @sqshl16b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqshl.16b v0, v0, v1
+; CHECK-NEXT: sqshl v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp2 = load <16 x i8>, ptr %B
@@ -149,7 +251,7 @@ define <8 x i16> @sqshl8h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqshl.8h v0, v0, v1
+; CHECK-NEXT: sqshl v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp2 = load <8 x i16>, ptr %B
@@ -162,7 +264,7 @@ define <4 x i32> @sqshl4s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqshl.4s v0, v0, v1
+; CHECK-NEXT: sqshl v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp2 = load <4 x i32>, ptr %B
@@ -175,7 +277,7 @@ define <2 x i64> @sqshl2d(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqshl.2d v0, v0, v1
+; CHECK-NEXT: sqshl v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp2 = load <2 x i64>, ptr %B
@@ -188,7 +290,7 @@ define <16 x i8> @uqshl16b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: uqshl.16b v0, v0, v1
+; CHECK-NEXT: uqshl v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp2 = load <16 x i8>, ptr %B
@@ -201,7 +303,7 @@ define <8 x i16> @uqshl8h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: uqshl.8h v0, v0, v1
+; CHECK-NEXT: uqshl v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp2 = load <8 x i16>, ptr %B
@@ -214,7 +316,7 @@ define <4 x i32> @uqshl4s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: uqshl.4s v0, v0, v1
+; CHECK-NEXT: uqshl v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp2 = load <4 x i32>, ptr %B
@@ -227,7 +329,7 @@ define <2 x i64> @uqshl2d(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: uqshl.2d v0, v0, v1
+; CHECK-NEXT: uqshl v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp2 = load <2 x i64>, ptr %B
@@ -315,7 +417,7 @@ define <8 x i8> @srshl8b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: srshl.8b v0, v0, v1
+; CHECK-NEXT: srshl v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp2 = load <8 x i8>, ptr %B
@@ -328,7 +430,7 @@ define <4 x i16> @srshl4h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: srshl.4h v0, v0, v1
+; CHECK-NEXT: srshl v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp2 = load <4 x i16>, ptr %B
@@ -341,7 +443,7 @@ define <2 x i32> @srshl2s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: srshl.2s v0, v0, v1
+; CHECK-NEXT: srshl v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp2 = load <2 x i32>, ptr %B
@@ -394,10 +496,10 @@ define i64 @srshl_scalar(ptr %A, ptr %B) nounwind {
 define i64 @srshl_scalar_constant(ptr %A) nounwind {
 ; CHECK-LABEL: srshl_scalar_constant:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: mov w9, #1 // =0x1
-; CHECK-NEXT: fmov d1, x9
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldr x9, [x0]
+; CHECK-NEXT: mov w8, #1 // =0x1
+; CHECK-NEXT: fmov d1, x8
+; CHECK-NEXT: fmov d0, x9
 ; CHECK-NEXT: srshl d0, d0, d1
 ; CHECK-NEXT: fmov x0, d0
 ; CHECK-NEXT: ret
@@ -411,7 +513,7 @@ define <8 x i8> @urshl8b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: urshl.8b v0, v0, v1
+; CHECK-NEXT: urshl v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp2 = load <8 x i8>, ptr %B
@@ -424,7 +526,7 @@ define <4 x i16> @urshl4h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: urshl.4h v0, v0, v1
+; CHECK-NEXT: urshl v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp2 = load <4 x i16>, ptr %B
@@ -437,7 +539,7 @@ define <2 x i32> @urshl2s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: urshl.2s v0, v0, v1
+; CHECK-NEXT: urshl v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp2 = load <2 x i32>, ptr %B
@@ -490,10 +592,10 @@ define i64 @urshl_scalar(ptr %A, ptr %B) nounwind {
 define i64 @urshl_scalar_constant(ptr %A) nounwind {
 ; CHECK-LABEL: urshl_scalar_constant:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: mov w9, #1 // =0x1
-; CHECK-NEXT: fmov d1, x9
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldr x9, [x0]
+; CHECK-NEXT: mov w8, #1 // =0x1
+; CHECK-NEXT: fmov d1, x8
+; CHECK-NEXT: fmov d0, x9
 ; CHECK-NEXT: urshl d0, d0, d1
 ; CHECK-NEXT: fmov x0, d0
 ; CHECK-NEXT: ret
@@ -507,7 +609,7 @@ define <16 x i8> @srshl16b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: srshl.16b v0, v0, v1
+; CHECK-NEXT: srshl v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp2 = load <16 x i8>, ptr %B
@@ -520,7 +622,7 @@ define <8 x i16> @srshl8h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: srshl.8h v0, v0, v1
+; CHECK-NEXT: srshl v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp2 = load <8 x i16>, ptr %B
@@ -533,7 +635,7 @@ define <4 x i32> @srshl4s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: srshl.4s v0, v0, v1
+; CHECK-NEXT: srshl v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp2 = load <4 x i32>, ptr %B
@@ -546,7 +648,7 @@ define <2 x i64> @srshl2d(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: srshl.2d v0, v0, v1
+; CHECK-NEXT: srshl v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp2 = load <2 x i64>, ptr %B
@@ -559,7 +661,7 @@ define <16 x i8> @urshl16b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: urshl.16b v0, v0, v1
+; CHECK-NEXT: urshl v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp2 = load <16 x i8>, ptr %B
@@ -572,7 +674,7 @@ define <8 x i16> @urshl8h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: urshl.8h v0, v0, v1
+; CHECK-NEXT: urshl v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp2 = load <8 x i16>, ptr %B
@@ -585,7 +687,7 @@ define <4 x i32> @urshl4s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: urshl.4s v0, v0, v1
+; CHECK-NEXT: urshl v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp2 = load <4 x i32>, ptr %B
@@ -598,7 +700,7 @@ define <2 x i64> @urshl2d(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: urshl.2d v0, v0, v1
+; CHECK-NEXT: urshl v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp2 = load <2 x i64>, ptr %B
@@ -633,7 +735,7 @@ define <8 x i8> @sqrshl8b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: sqrshl.8b v0, v0, v1
+; CHECK-NEXT: sqrshl v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp2 = load <8 x i8>, ptr %B
@@ -646,7 +748,7 @@ define <4 x i16> @sqrshl4h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: sqrshl.4h v0, v0, v1
+; CHECK-NEXT: sqrshl v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp2 = load <4 x i16>, ptr %B
@@ -659,7 +761,7 @@ define <2 x i32> @sqrshl2s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: sqrshl.2s v0, v0, v1
+; CHECK-NEXT: sqrshl v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp2 = load <2 x i32>, ptr %B
@@ -672,7 +774,7 @@ define <8 x i8> @uqrshl8b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: uqrshl.8b v0, v0, v1
+; CHECK-NEXT: uqrshl v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp2 = load <8 x i8>, ptr %B
@@ -685,7 +787,7 @@ define <4 x i16> @uqrshl4h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: uqrshl.4h v0, v0, v1
+; CHECK-NEXT: uqrshl v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp2 = load <4 x i16>, ptr %B
@@ -698,7 +800,7 @@ define <2 x i32> @uqrshl2s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: uqrshl.2s v0, v0, v1
+; CHECK-NEXT: uqrshl v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp2 = load <2 x i32>, ptr %B
@@ -711,7 +813,7 @@ define <16 x i8> @sqrshl16b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqrshl.16b v0, v0, v1
+; CHECK-NEXT: sqrshl v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp2 = load <16 x i8>, ptr %B
@@ -724,7 +826,7 @@ define <8 x i16> @sqrshl8h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqrshl.8h v0, v0, v1
+; CHECK-NEXT: sqrshl v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp2 = load <8 x i16>, ptr %B
@@ -737,7 +839,7 @@ define <4 x i32> @sqrshl4s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqrshl.4s v0, v0, v1
+; CHECK-NEXT: sqrshl v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp2 = load <4 x i32>, ptr %B
@@ -750,7 +852,7 @@ define <2 x i64> @sqrshl2d(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqrshl.2d v0, v0, v1
+; CHECK-NEXT: sqrshl v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp2 = load <2 x i64>, ptr %B
@@ -803,10 +905,10 @@ define i64 @sqrshl_scalar(ptr %A, ptr %B) nounwind {
 define i64 @sqrshl_scalar_constant(ptr %A) nounwind {
 ; CHECK-LABEL: sqrshl_scalar_constant:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: mov w9, #1 // =0x1
-; CHECK-NEXT: fmov d1, x9
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldr x9, [x0]
+; CHECK-NEXT: mov w8, #1 // =0x1
+; CHECK-NEXT: fmov d1, x8
+; CHECK-NEXT: fmov d0, x9
 ; CHECK-NEXT: sqrshl d0, d0, d1
 ; CHECK-NEXT: fmov x0, d0
 ; CHECK-NEXT: ret
@@ -820,7 +922,7 @@ define <16 x i8> @uqrshl16b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: uqrshl.16b v0, v0, v1
+; CHECK-NEXT: uqrshl v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp2 = load <16 x i8>, ptr %B
@@ -833,7 +935,7 @@ define <8 x i16> @uqrshl8h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: uqrshl.8h v0, v0, v1
+; CHECK-NEXT: uqrshl v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp2 = load <8 x i16>, ptr %B
@@ -846,7 +948,7 @@ define <4 x i32> @uqrshl4s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: uqrshl.4s v0, v0, v1
+; CHECK-NEXT: uqrshl v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp2 = load <4 x i32>, ptr %B
@@ -859,7 +961,7 @@ define <2 x i64> @uqrshl2d(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: uqrshl.2d v0, v0, v1
+; CHECK-NEXT: uqrshl v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp2 = load <2 x i64>, ptr %B
@@ -912,10 +1014,10 @@ define i64 @uqrshl_scalar(ptr %A, ptr %B) nounwind {
 define i64 @uqrshl_scalar_constant(ptr %A) nounwind {
 ; CHECK-LABEL: uqrshl_scalar_constant:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: mov w9, #1 // =0x1
-; CHECK-NEXT: fmov d1, x9
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldr x9, [x0]
+; CHECK-NEXT: mov w8, #1 // =0x1
+; CHECK-NEXT: fmov d1, x8
+; CHECK-NEXT: fmov d0, x9
 ; CHECK-NEXT: uqrshl d0, d0, d1
 ; CHECK-NEXT: fmov x0, d0
 ; CHECK-NEXT: ret
@@ -947,77 +1049,126 @@ declare <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32>, <4 x i32>) nounwind
 declare <2 x i64> @llvm.aarch64.neon.uqrshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone

 define <8 x i8> @urshr8b(ptr %A) nounwind {
-; CHECK-LABEL: urshr8b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: urshr.8b v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: urshr8b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: urshr v0.8b, v0.8b, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: urshr8b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: urshl v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> )
 ret <8 x i8> %tmp3
 }

 define <4 x i16> @urshr4h(ptr %A) nounwind {
-; CHECK-LABEL: urshr4h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: urshr.4h v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: urshr4h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: urshr v0.4h, v0.4h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: urshr4h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: urshl v0.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> )
 ret <4 x i16> %tmp3
 }

 define <2 x i32> @urshr2s(ptr %A) nounwind {
-; CHECK-LABEL: urshr2s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: urshr.2s v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: urshr2s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: urshr v0.2s, v0.2s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: urshr2s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: urshl v0.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x i32> )
 ret <2 x i32> %tmp3
 }

 define <16 x i8> @urshr16b(ptr %A) nounwind {
-; CHECK-LABEL: urshr16b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: urshr.16b v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: urshr16b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: urshr v0.16b, v0.16b, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: urshr16b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: urshl v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp3 = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> )
 ret <16 x i8> %tmp3
 }

 define <8 x i16> @urshr8h(ptr %A) nounwind {
-; CHECK-LABEL: urshr8h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: urshr.8h v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: urshr8h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: urshr v0.8h, v0.8h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: urshr8h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: urshl v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> )
 ret <8 x i16> %tmp3
 }

 define <4 x i32> @urshr4s(ptr %A) nounwind {
-; CHECK-LABEL: urshr4s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: urshr.4s v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: urshr4s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: urshr v0.4s, v0.4s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: urshr4s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: urshl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> )
 ret <4 x i32> %tmp3
 }

 define <2 x i64> @urshr2d(ptr %A) nounwind {
-; CHECK-LABEL: urshr2d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: urshr.2d v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: urshr2d:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: urshr v0.2d, v0.2d, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: urshr2d:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: urshl v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp3 = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> )
 ret <2 x i64> %tmp3
@@ -1047,77 +1198,126 @@ define i64 @urshr_scalar(ptr %A) nounwind {
 }

 define <8 x i8> @srshr8b(ptr %A) nounwind {
-; CHECK-LABEL: srshr8b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: srshr.8b v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: srshr8b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: srshr v0.8b, v0.8b, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: srshr8b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: srshl v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> )
 ret <8 x i8> %tmp3
 }

 define <4 x i16> @srshr4h(ptr %A) nounwind {
-; CHECK-LABEL: srshr4h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: srshr.4h v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: srshr4h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: srshr v0.4h, v0.4h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: srshr4h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: srshl v0.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> )
 ret <4 x i16> %tmp3
 }

 define <2 x i32> @srshr2s(ptr %A) nounwind {
-; CHECK-LABEL: srshr2s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: srshr.2s v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: srshr2s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: srshr v0.2s, v0.2s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: srshr2s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: srshl v0.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> )
 ret <2 x i32> %tmp3
 }

 define <16 x i8> @srshr16b(ptr %A) nounwind {
-; CHECK-LABEL: srshr16b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: srshr.16b v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: srshr16b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: srshr v0.16b, v0.16b, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: srshr16b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: srshl v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp3 = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> )
 ret <16 x i8> %tmp3
 }

 define <8 x i16> @srshr8h(ptr %A) nounwind {
-; CHECK-LABEL: srshr8h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: srshr.8h v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: srshr8h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: srshr v0.8h, v0.8h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: srshr8h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: srshl v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> )
 ret <8 x i16> %tmp3
 }

 define <4 x i32> @srshr4s(ptr %A) nounwind {
-; CHECK-LABEL: srshr4s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: srshr.4s v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: srshr4s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: srshr v0.4s, v0.4s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: srshr4s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: srshl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> )
 ret <4 x i32> %tmp3
 }

 define <2 x i64> @srshr2d(ptr %A) nounwind {
-; CHECK-LABEL: srshr2d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: srshr.2d v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: srshr2d:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: srshr v0.2d, v0.2d, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: srshr2d:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: srshl v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp3 = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> )
 ret <2 x i64> %tmp3
@@ -1150,7 +1350,7 @@ define <8 x i8> @sqshlu8b(ptr %A) nounwind {
 ; CHECK-LABEL: sqshlu8b:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: sqshlu.8b v0, v0, #1
+; CHECK-NEXT: sqshlu v0.8b, v0.8b, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> %tmp1, <8 x i8> )
@@ -1161,7 +1361,7 @@ define <4 x i16> @sqshlu4h(ptr %A) nounwind {
 ; CHECK-LABEL: sqshlu4h:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: sqshlu.4h v0, v0, #1
+; CHECK-NEXT: sqshlu v0.4h, v0.4h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> %tmp1, <4 x i16> )
@@ -1172,7 +1372,7 @@ define <2 x i32> @sqshlu2s(ptr %A) nounwind {
 ; CHECK-LABEL: sqshlu2s:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: sqshlu.2s v0, v0, #1
+; CHECK-NEXT: sqshlu v0.2s, v0.2s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32> %tmp1, <2 x i32> )
@@ -1183,7 +1383,7 @@ define <16 x i8> @sqshlu16b(ptr %A) nounwind {
 ; CHECK-LABEL: sqshlu16b:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqshlu.16b v0, v0, #1
+; CHECK-NEXT: sqshlu v0.16b, v0.16b, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8> %tmp1, <16 x i8> )
@@ -1194,7 +1394,7 @@ define <8 x i16> @sqshlu8h(ptr %A) nounwind {
 ; CHECK-LABEL: sqshlu8h:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqshlu.8h v0, v0, #1
+; CHECK-NEXT: sqshlu v0.8h, v0.8h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16> %tmp1, <8 x i16> )
@@ -1205,7 +1405,7 @@ define <4 x i32> @sqshlu4s(ptr %A) nounwind {
 ; CHECK-LABEL: sqshlu4s:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqshlu.4s v0, v0, #1
+; CHECK-NEXT: sqshlu v0.4s, v0.4s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32> %tmp1, <4 x i32> )
@@ -1216,7 +1416,7 @@ define <2 x i64> @sqshlu2d(ptr %A) nounwind {
 ; CHECK-LABEL: sqshlu2d:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqshlu.2d v0, v0, #1
+; CHECK-NEXT: sqshlu v0.2d, v0.2d, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> %tmp1, <2 x i64> )
@@ -1275,7 +1475,7 @@ define <8 x i8> @rshrn8b(ptr %A) nounwind {
 ; CHECK-LABEL: rshrn8b:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: rshrn.8b v0, v0, #1
+; CHECK-NEXT: rshrn v0.8b, v0.8h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %tmp1, i32 1)
@@ -1286,7 +1486,7 @@ define <4 x i16> @rshrn4h(ptr %A) nounwind {
 ; CHECK-LABEL: rshrn4h:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: rshrn.4h v0, v0, #1
+; CHECK-NEXT: rshrn v0.4h, v0.4s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %tmp1, i32 1)
@@ -1297,7 +1497,7 @@ define <2 x i32> @rshrn2s(ptr %A) nounwind {
 ; CHECK-LABEL: rshrn2s:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: rshrn.2s v0, v0, #1
+; CHECK-NEXT: rshrn v0.2s, v0.2d, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> %tmp1, i32 1)
@@ -1309,7 +1509,7 @@ define <16 x i8> @rshrn16b(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: rshrn2.16b v0, v1, #1
+; CHECK-NEXT: rshrn2 v0.16b, v1.8h, #1
 ; CHECK-NEXT: ret
 %out = load <8 x i8>, ptr %ret
 %tmp1 = load <8 x i16>, ptr %A
@@ -1323,7 +1523,7 @@ define <8 x i16> @rshrn8h(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: rshrn2.8h v0, v1, #1
+; CHECK-NEXT: rshrn2 v0.8h, v1.4s, #1
 ; CHECK-NEXT: ret
 %out = load <4 x i16>, ptr %ret
 %tmp1 = load <4 x i32>, ptr %A
@@ -1337,7 +1537,7 @@ define <4 x i32> @rshrn4s(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: rshrn2.4s v0, v1, #1
+; CHECK-NEXT: rshrn2 v0.4s, v1.2d, #1
 ; CHECK-NEXT: ret
 %out = load <2 x i32>, ptr %ret
 %tmp1 = load <2 x i64>, ptr %A
@@ -1354,7 +1554,7 @@ define <8 x i8> @shrn8b(ptr %A) nounwind {
 ; CHECK-LABEL: shrn8b:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: shrn.8b v0, v0, #1
+; CHECK-NEXT: shrn v0.8b, v0.8h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp2 = lshr <8 x i16> %tmp1,
@@ -1366,7 +1566,7 @@ define <4 x i16> @shrn4h(ptr %A) nounwind {
 ; CHECK-LABEL: shrn4h:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: shrn.4h v0, v0, #1
+; CHECK-NEXT: shrn v0.4h, v0.4s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp2 = lshr <4 x i32> %tmp1,
@@ -1378,7 +1578,7 @@ define <2 x i32> @shrn2s(ptr %A) nounwind {
 ; CHECK-LABEL: shrn2s:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: shrn.2s v0, v0, #1
+; CHECK-NEXT: shrn v0.2s, v0.2d, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp2 = lshr <2 x i64> %tmp1,
@@ -1391,7 +1591,7 @@ define <16 x i8> @shrn16b(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: shrn2.16b v0, v1, #1
+; CHECK-NEXT: shrn2 v0.16b, v1.8h, #1
 ; CHECK-NEXT: ret
 %out = load <8 x i8>, ptr %ret
 %tmp1 = load <8 x i16>, ptr %A
@@ -1406,7 +1606,7 @@ define <8 x i16> @shrn8h(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: shrn2.8h v0, v1, #1
+; CHECK-NEXT: shrn2 v0.8h, v1.4s, #1
 ; CHECK-NEXT: ret
 %out = load <4 x i16>, ptr %ret
 %tmp1 = load <4 x i32>, ptr %A
@@ -1421,7 +1621,7 @@ define <4 x i32> @shrn4s(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: shrn2.4s v0, v1, #1
+; CHECK-NEXT: shrn2 v0.4s, v1.2d, #1
 ; CHECK-NEXT: ret
 %out = load <2 x i32>, ptr %ret
 %tmp1 = load <2 x i64>, ptr %A
@@ -1450,7 +1650,7 @@ define <8 x i8> @sqshrn8b(ptr %A) nounwind {
 ; CHECK-LABEL: sqshrn8b:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqshrn.8b v0, v0, #1
+; CHECK-NEXT: sqshrn v0.8b, v0.8h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> %tmp1, i32 1)
@@ -1461,7 +1661,7 @@ define <4 x i16> @sqshrn4h(ptr %A) nounwind {
 ; CHECK-LABEL: sqshrn4h:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqshrn.4h v0, v0, #1
+; CHECK-NEXT: sqshrn v0.4h, v0.4s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> %tmp1, i32 1)
@@ -1472,7 +1672,7 @@ define <2 x i32> @sqshrn2s(ptr %A) nounwind {
 ; CHECK-LABEL: sqshrn2s:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqshrn.2s v0, v0, #1
+; CHECK-NEXT: sqshrn v0.2s, v0.2d, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> %tmp1, i32 1)
@@ -1485,7 +1685,7 @@ define <16 x i8> @sqshrn16b(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqshrn2.16b v0, v1, #1
+; CHECK-NEXT: sqshrn2 v0.16b, v1.8h, #1
 ; CHECK-NEXT: ret
 %out = load <8 x i8>, ptr %ret
 %tmp1 = load <8 x i16>, ptr %A
@@ -1499,7 +1699,7 @@ define <8 x i16> @sqshrn8h(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqshrn2.8h v0, v1, #1
+; CHECK-NEXT: sqshrn2 v0.8h, v1.4s, #1
 ; CHECK-NEXT: ret
 %out = load <4 x i16>, ptr %ret
 %tmp1 = load <4 x i32>, ptr %A
@@ -1513,7 +1713,7 @@ define <4 x i32> @sqshrn4s(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqshrn2.4s v0, v1, #1
+; CHECK-NEXT: sqshrn2 v0.4s, v1.2d, #1
 ; CHECK-NEXT: ret
 %out = load <2 x i32>, ptr %ret
 %tmp1 = load <2 x i64>, ptr %A
@@ -1542,7 +1742,7 @@ define <8 x i8> @sqshrun8b(ptr %A) nounwind {
 ; CHECK-LABEL: sqshrun8b:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqshrun.8b v0, v0, #1
+; CHECK-NEXT: sqshrun v0.8b, v0.8h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> %tmp1, i32 1)
@@ -1553,7 +1753,7 @@ define <4 x i16> @sqshrun4h(ptr %A) nounwind {
 ; CHECK-LABEL: sqshrun4h:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqshrun.4h v0, v0, #1
+; CHECK-NEXT: sqshrun v0.4h, v0.4s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> %tmp1, i32 1)
@@ -1564,7 +1764,7 @@ define <2 x i32> @sqshrun2s(ptr %A) nounwind {
 ; CHECK-LABEL: sqshrun2s:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqshrun.2s v0, v0, #1
+; CHECK-NEXT: sqshrun v0.2s, v0.2d, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> %tmp1, i32 1)
@@ -1576,7 +1776,7 @@ define <16 x i8> @sqshrun16b(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqshrun2.16b v0, v1, #1
+; CHECK-NEXT: sqshrun2 v0.16b, v1.8h, #1
 ; CHECK-NEXT: ret
 %out = load <8 x i8>, ptr %ret
 %tmp1 = load <8 x i16>, ptr %A
@@ -1590,7 +1790,7 @@ define <8 x i16> @sqshrun8h(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqshrun2.8h v0, v1, #1
+; CHECK-NEXT: sqshrun2 v0.8h, v1.4s, #1
 ; CHECK-NEXT: ret
 %out = load <4 x i16>, ptr %ret
 %tmp1 = load <4 x i32>, ptr %A
@@ -1604,7 +1804,7 @@ define <4 x i32> @sqshrun4s(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqshrun2.4s v0, v1, #1
+; CHECK-NEXT: sqshrun2 v0.4s, v1.2d, #1
 ; CHECK-NEXT: ret
 %out = load <2 x i32>, ptr %ret
 %tmp1 = load <2 x i64>, ptr %A
@@ -1633,7 +1833,7 @@ define <8 x i8> @sqrshrn8b(ptr %A) nounwind {
 ; CHECK-LABEL: sqrshrn8b:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqrshrn.8b v0, v0, #1
+; CHECK-NEXT: sqrshrn v0.8b, v0.8h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
@@ -1644,7 +1844,7 @@ define <4 x i16> @sqrshrn4h(ptr %A) nounwind {
 ; CHECK-LABEL: sqrshrn4h:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqrshrn.4h v0, v0, #1
+; CHECK-NEXT: sqrshrn v0.4h, v0.4s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
@@ -1655,7 +1855,7 @@ define <2 x i32> @sqrshrn2s(ptr %A) nounwind {
 ; CHECK-LABEL: sqrshrn2s:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqrshrn.2s v0, v0, #1
+; CHECK-NEXT: sqrshrn v0.2s, v0.2d, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
@@ -1667,7 +1867,7 @@ define <16 x i8> @sqrshrn16b(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqrshrn2.16b v0, v1, #1
+; CHECK-NEXT: sqrshrn2 v0.16b, v1.8h, #1
 ; CHECK-NEXT: ret
 %out = load <8 x i8>, ptr %ret
 %tmp1 = load <8 x i16>, ptr %A
@@ -1681,7 +1881,7 @@ define <8 x i16> @sqrshrn8h(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqrshrn2.8h v0, v1, #1
+; CHECK-NEXT: sqrshrn2 v0.8h, v1.4s, #1
 ; CHECK-NEXT: ret
 %out = load <4 x i16>, ptr %ret
 %tmp1 = load <4 x i32>, ptr %A
@@ -1695,7 +1895,7 @@ define <4 x i32> @sqrshrn4s(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqrshrn2.4s v0, v1, #1
+; CHECK-NEXT: sqrshrn2 v0.4s, v1.2d, #1
 ; CHECK-NEXT: ret
 %out = load <2 x i32>, ptr %ret
 %tmp1 = load <2 x i64>, ptr %A
@@ -1724,7 +1924,7 @@ define <8 x i8> @sqrshrun8b(ptr %A) nounwind {
 ; CHECK-LABEL: sqrshrun8b:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqrshrun.8b v0, v0, #1
+; CHECK-NEXT: sqrshrun v0.8b, v0.8h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> %tmp1, i32 1)
@@ -1735,7 +1935,7 @@ define <4 x i16> @sqrshrun4h(ptr %A) nounwind {
 ; CHECK-LABEL: sqrshrun4h:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqrshrun.4h v0, v0, #1
+; CHECK-NEXT: sqrshrun v0.4h, v0.4s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> %tmp1, i32 1)
@@ -1746,7 +1946,7 @@ define <2 x i32> @sqrshrun2s(ptr %A) nounwind {
 ; CHECK-LABEL: sqrshrun2s:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqrshrun.2s v0, v0, #1
+; CHECK-NEXT: sqrshrun v0.2s, v0.2d, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> %tmp1, i32 1)
@@ -1758,7 +1958,7 @@ define <16 x i8> @sqrshrun16b(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqrshrun2.16b v0, v1, #1
+; CHECK-NEXT: sqrshrun2 v0.16b, v1.8h, #1
 ; CHECK-NEXT: ret
 %out = load <8 x i8>, ptr %ret
 %tmp1 = load <8 x i16>, ptr %A
@@ -1772,7 +1972,7 @@ define <8 x i16> @sqrshrun8h(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqrshrun2.8h v0, v1, #1
+; CHECK-NEXT: sqrshrun2 v0.8h, v1.4s, #1
 ; CHECK-NEXT: ret
 %out = load <4 x i16>, ptr %ret
 %tmp1 = load <4 x i32>, ptr %A
@@ -1786,7 +1986,7 @@ define <4 x i32> @sqrshrun4s(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqrshrun2.4s v0, v1, #1
+; CHECK-NEXT: sqrshrun2 v0.4s, v1.2d, #1
 ; CHECK-NEXT: ret
 %out = load <2 x i32>, ptr %ret
 %tmp1 = load <2 x i64>, ptr %A
@@ -1815,7 +2015,7 @@ define <8 x i8> @uqrshrn8b(ptr %A) nounwind {
 ; CHECK-LABEL: uqrshrn8b:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: uqrshrn.8b v0, v0, #1
+; CHECK-NEXT: uqrshrn v0.8b, v0.8h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
@@ -1826,7 +2026,7 @@ define <4 x i16> @uqrshrn4h(ptr %A) nounwind {
 ; CHECK-LABEL: uqrshrn4h:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: uqrshrn.4h v0, v0, #1
+; CHECK-NEXT: uqrshrn v0.4h, v0.4s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
@@ -1837,7 +2037,7 @@ define <2 x i32> @uqrshrn2s(ptr %A) nounwind {
 ; CHECK-LABEL: uqrshrn2s:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: uqrshrn.2s v0, v0, #1
+; CHECK-NEXT: uqrshrn v0.2s, v0.2d, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
@@ -1849,7 +2049,7 @@ define <16 x i8> @uqrshrn16b(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: uqrshrn2.16b v0, v1, #1
+; CHECK-NEXT: uqrshrn2 v0.16b, v1.8h, #1
 ; CHECK-NEXT: ret
 %out = load <8 x i8>, ptr %ret
 %tmp1 = load <8 x i16>, ptr %A
@@ -1863,7 +2063,7 @@ define <8 x i16> @uqrshrn8h(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: uqrshrn2.8h v0, v1, #1
+; CHECK-NEXT: uqrshrn2 v0.8h, v1.4s, #1
 ; CHECK-NEXT: ret
 %out = load <4 x i16>, ptr %ret
 %tmp1 = load <4 x i32>, ptr %A
@@ -1877,7 +2077,7 @@ define <4 x i32> @uqrshrn4s(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: uqrshrn2.4s v0, v1, #1
+; CHECK-NEXT: uqrshrn2 v0.4s, v1.2d, #1
 ; CHECK-NEXT: ret
 %out = load <2 x i32>, ptr %ret
 %tmp1 = load <2 x i64>, ptr %A
@@ -1906,7 +2106,7 @@ define <8 x i8> @uqshrn8b(ptr %A) nounwind {
 ; CHECK-LABEL: uqshrn8b:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: uqshrn.8b v0, v0, #1
+; CHECK-NEXT: uqshrn v0.8b, v0.8h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> %tmp1, i32 1)
@@ -1917,7 +2117,7 @@ define <4 x i16> @uqshrn4h(ptr %A) nounwind {
 ; CHECK-LABEL: uqshrn4h:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: uqshrn.4h v0, v0, #1
+; CHECK-NEXT: uqshrn v0.4h, v0.4s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> %tmp1, i32 1)
@@ -1928,7 +2128,7 @@ define <2 x i32> @uqshrn2s(ptr %A) nounwind {
 ; CHECK-LABEL: uqshrn2s:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: uqshrn.2s v0, v0, #1
+; CHECK-NEXT: uqshrn v0.2s, v0.2d, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> %tmp1, i32 1)
@@ -1940,7 +2140,7 @@ define <16 x i8> @uqshrn16b(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: uqshrn2.16b v0, v1, #1
+; CHECK-NEXT: uqshrn2 v0.16b, v1.8h, #1
 ; CHECK-NEXT: ret
 %out = load <8 x i8>, ptr %ret
 %tmp1 = load <8 x i16>, ptr %A
@@ -1954,7 +2154,7 @@ define <8 x i16> @uqshrn8h(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: uqshrn2.8h v0, v1, #1
+; CHECK-NEXT: uqshrn2 v0.8h, v1.4s, #1
 ; CHECK-NEXT: ret
 %out = load <4 x i16>, ptr %ret
 %tmp1 = load <4 x i32>, ptr %A
@@ -1968,7 +2168,7 @@ define <4 x i32> @uqshrn4s(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: uqshrn2.4s v0, v1, #1
+; CHECK-NEXT: uqshrn2 v0.4s, v1.2d, #1
 ; CHECK-NEXT: ret
 %out = load <2 x i32>, ptr %ret
 %tmp1 = load <2 x i64>, ptr %A
@@ -1986,7 +2186,7 @@ define <8 x i16> @ushll8h(ptr %A) nounwind {
 ; CHECK-LABEL: ushll8h:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ushll.8h v0, v0, #1
+; CHECK-NEXT: ushll v0.8h, v0.8b, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp2 = zext <8 x i8> %tmp1 to <8 x i16>
@@ -1998,7 +2198,7 @@ define <4 x i32> @ushll4s(ptr %A) nounwind {
 ; CHECK-LABEL: ushll4s:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ushll.4s v0, v0, #1
+; CHECK-NEXT: ushll v0.4s, v0.4h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp2 = zext <4 x i16> %tmp1 to <4 x i32>
@@ -2010,7 +2210,7 @@ define <2 x i64> @ushll2d(ptr %A) nounwind {
 ; CHECK-LABEL: ushll2d:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ushll.2d v0, v0, #1
+; CHECK-NEXT: ushll v0.2d, v0.2s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp2 = zext <2 x i32> %tmp1 to <2 x i64>
@@ -2019,11 +2219,18 @@ define <2 x i64> @ushll2d(ptr %A) nounwind {
 }

 define <8 x i16> @ushll2_8h(ptr %A) nounwind {
-; CHECK-LABEL: ushll2_8h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0, #8]
-; CHECK-NEXT: ushll.8h v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ushll2_8h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0, #8]
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ushll2_8h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: mov d0, v0.d[1]
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #1
+; CHECK-GI-NEXT: ret
 %load1 = load <16 x i8>, ptr %A
 %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32>
 %tmp2 = zext <8 x i8> %tmp1 to <8 x i16>
@@ -2032,11 +2239,18 @@ define <8 x i16> @ushll2_8h(ptr %A) nounwind {
 }

 define <4 x i32> @ushll2_4s(ptr %A) nounwind {
-; CHECK-LABEL: ushll2_4s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0, #8]
-; CHECK-NEXT: ushll.4s v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ushll2_4s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0, #8]
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ushll2_4s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: mov d0, v0.d[1]
+; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #1
+; CHECK-GI-NEXT: ret
 %load1 = load <8 x i16>, ptr %A
 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32>
 %tmp2 = zext <4 x i16> %tmp1 to <4 x i32>
@@ -2045,11 +2259,18 @@ define <4 x i32> @ushll2_4s(ptr %A) nounwind {
 }

 define <2 x i64> @ushll2_2d(ptr %A) nounwind {
-; CHECK-LABEL: ushll2_2d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0, #8]
-; CHECK-NEXT: ushll.2d v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ushll2_2d:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0, #8]
+; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ushll2_2d:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: mov d0, v0.d[1]
+; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #1
+; CHECK-GI-NEXT: ret
 %load1 = load <4 x i32>, ptr %A
 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32>
 %tmp2 = zext <2 x i32> %tmp1 to <2 x i64>
@@ -2064,24 +2285,32 @@ declare <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64>, <2 x i64>)
 declare <1 x i64> @llvm.aarch64.neon.ushl.v1i64(<1 x i64>, <1 x i64>)
 declare i64 @llvm.aarch64.neon.ushl.i64(i64, i64)

-define <8 x i16> @neon.ushll8h_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.ushll8h_constant_shift:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ushll.8h v0, v0, #1
-; CHECK-NEXT: ret
+define <8 x i16> @neon_ushll8h_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_ushll8h_constant_shift:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_ushll8h_constant_shift:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: movi v1.8h, #1
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ushl v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp2 = zext <8 x i8> %tmp1 to <8 x i16>
 %tmp3 = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> %tmp2, <8 x i16> )
 ret <8 x i16> %tmp3
 }

-define <8 x i16> @neon.ushl8h_no_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.ushl8h_no_constant_shift:
+define <8 x i16> @neon_ushl8h_no_constant_shift(ptr %A) nounwind {
+; CHECK-LABEL: neon_ushl8h_no_constant_shift:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ushll.8h v0, v0, #0
-; CHECK-NEXT: ushl.8h v0, v0, v0
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushl v0.8h, v0.8h, v0.8h
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp2 = zext <8 x i8> %tmp1 to <8 x i16>
@@ -2089,36 +2318,76 @@ define <8 x i16> @neon.ushl8h_no_constant_shift(ptr %A) nounwind {
 ret <8 x i16> %tmp3
 }

-define <4 x i32> @neon.ushl8h_constant_shift_extend_not_2x(ptr %A) nounwind {
-; CHECK-LABEL: neon.ushl8h_constant_shift_extend_not_2x:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr s0, [x0]
-; CHECK-NEXT: ushll.8h v0, v0, #0
-; CHECK-NEXT: ushll.4s v0, v0, #1
-; CHECK-NEXT: ret
+define <4 x i32> @neon_ushl8h_constant_shift_extend_not_2x(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_ushl8h_constant_shift_extend_not_2x:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_ushl8h_constant_shift_extend_not_2x:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr w8, [x0]
+; CHECK-GI-NEXT: movi v0.4s, #1
+; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: uxtb w8, w8
+; CHECK-GI-NEXT: mov b2, v1.b[2]
+; CHECK-GI-NEXT: mov b3, v1.b[1]
+; CHECK-GI-NEXT: mov b4, v1.b[3]
+; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: fmov w9, s2
+; CHECK-GI-NEXT: fmov w10, s3
+; CHECK-GI-NEXT: fmov w11, s4
+; CHECK-GI-NEXT: uxtb w9, w9
+; CHECK-GI-NEXT: uxtb w10, w10
+; CHECK-GI-NEXT: uxtb w11, w11
+; CHECK-GI-NEXT: fmov s2, w9
+; CHECK-GI-NEXT: mov v1.h[1], w10
+; CHECK-GI-NEXT: mov v2.h[1], w11
+; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT: mov v1.d[1], v2.d[0]
+; CHECK-GI-NEXT: ushl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i8>, ptr %A
 %tmp2 = zext <4 x i8> %tmp1 to <4 x i32>
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> %tmp2, <4 x i32> )
 ret <4 x i32> %tmp3
 }

-define <8 x i16> @neon.ushl8_noext_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.ushl8_noext_constant_shift:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: add.8h v0, v0, v0
-; CHECK-NEXT: ret
+define <8 x i16> @neon_ushl8_noext_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_ushl8_noext_constant_shift:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: add v0.8h, v0.8h, v0.8h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_ushl8_noext_constant_shift:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.8h, #1
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: ushl v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> %tmp1, <8 x i16> )
 ret <8 x i16> %tmp3
 }

-define <4 x i32> @neon.ushll4s_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.ushll4s_constant_shift:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ushll.4s v0, v0, #1
-; CHECK-NEXT: ret
+define <4 x i32> @neon_ushll4s_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_ushll4s_constant_shift:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_ushll4s_constant_shift:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: movi v1.4s, #1
+; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp2 = zext <4 x i16> %tmp1 to <4 x i32>
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> %tmp2, <4 x i32> )
@@ -2126,13 +2395,21 @@ define <4 x i32> @neon.ushll4s_constant_shift(ptr %A) nounwind {
 }

 ; FIXME: unnecessary ushll.4s v0, v0, #0?
-define <4 x i32> @neon.ushll4s_neg_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.ushll4s_neg_constant_shift:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ushll.4s v0, v0, #0
-; CHECK-NEXT: ushr.4s v0, v0, #1
-; CHECK-NEXT: ret
+define <4 x i32> @neon_ushll4s_neg_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_ushll4s_neg_constant_shift:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushr v0.4s, v0.4s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_ushll4s_neg_constant_shift:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: movi v1.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp2 = zext <4 x i16> %tmp1 to <4 x i32>
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> %tmp2, <4 x i32> )
@@ -2140,35 +2417,52 @@ define <4 x i32> @neon.ushll4s_neg_constant_shift(ptr %A) nounwind {
 }

 ; FIXME: should be constant folded.
-define <4 x i32> @neon.ushll4s_constant_fold() nounwind {
-; CHECK-LABEL: neon.ushll4s_constant_fold:
-; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI160_0
-; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI160_0]
-; CHECK-NEXT: add.4s v0, v0, v0
-; CHECK-NEXT: ret
+define <4 x i32> @neon_ushll4s_constant_fold() nounwind {
+; CHECK-SD-LABEL: neon_ushll4s_constant_fold:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: adrp x8, .LCPI160_0
+; CHECK-SD-NEXT: ldr q0, [x8, :lo12:.LCPI160_0]
+; CHECK-SD-NEXT: add v0.4s, v0.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_ushll4s_constant_fold:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.4s, #1
+; CHECK-GI-NEXT: adrp x8, .LCPI160_0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI160_0]
+; CHECK-GI-NEXT: ushl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: ret
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> , <4 x i32> )
 ret <4 x i32> %tmp3
 }

-define <2 x i64> @neon.ushll2d_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.ushll2d_constant_shift:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ushll.2d v0, v0, #1
-; CHECK-NEXT: ret
+define <2 x i64> @neon_ushll2d_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_ushll2d_constant_shift:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_ushll2d_constant_shift:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: adrp x8, .LCPI161_0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI161_0]
+; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT: ushl v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp2 = zext <2 x i32> %tmp1 to <2 x i64>
 %tmp3 = call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> %tmp2, <2 x i64> )
 ret <2 x i64> %tmp3
 }

-define <1 x i64> @neon.ushl_vscalar_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.ushl_vscalar_constant_shift:
+define <1 x i64> @neon_ushl_vscalar_constant_shift(ptr %A) nounwind {
+; CHECK-LABEL: neon_ushl_vscalar_constant_shift:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi.2d v1, #0000000000000000
-; CHECK-NEXT: ldr s0, [x0]
-; CHECK-NEXT: zip1.2s v0, v0, v1
+; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: ldr s1, [x0]
+; CHECK-NEXT: zip1 v0.2s, v1.2s, v0.2s
 ; CHECK-NEXT: shl d0, d0, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <1 x i32>, ptr %A
@@ -2177,8 +2471,8 @@ define <1 x i64> @neon.ushl_vscalar_constant_shift(ptr %A) nounwind {
 ret <1 x i64> %tmp3
 }

-define i64 @neon.ushl_scalar_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.ushl_scalar_constant_shift:
+define i64 @neon_ushl_scalar_constant_shift(ptr %A) nounwind {
+; CHECK-LABEL: neon_ushl_scalar_constant_shift:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr w8, [x0]
 ; CHECK-NEXT: fmov d0, x8
@@ -2195,7 +2489,7 @@ define <8 x i16> @sshll8h(ptr %A) nounwind {
 ; CHECK-LABEL: sshll8h:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: sshll.8h v0, v0, #1
+; CHECK-NEXT: sshll v0.8h, v0.8b, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp2 = sext <8 x i8> %tmp1 to <8 x i16>
@@ -2207,7 +2501,7 @@ define <2 x i64> @sshll2d(ptr %A) nounwind {
 ; CHECK-LABEL: sshll2d:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: sshll.2d v0, v0, #1
+; CHECK-NEXT: sshll v0.2d, v0.2s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp2 = sext <2 x i32> %tmp1 to <2 x i64>
@@ -2222,85 +2516,156 @@ declare <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64>, <2 x i64>)
 declare <1 x i64> @llvm.aarch64.neon.sshl.v1i64(<1 x i64>, <1 x i64>)
 declare i64 @llvm.aarch64.neon.sshl.i64(i64, i64)

-define <16 x i8> @neon.sshl16b_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshl16b_constant_shift:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: add.16b v0, v0, v0
-; CHECK-NEXT: ret
+define <16 x i8> @neon_sshl16b_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_sshl16b_constant_shift:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: add v0.16b, v0.16b, v0.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_sshl16b_constant_shift:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.16b, #1
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: sshl v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp2 = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> %tmp1, <16 x i8> )
 ret <16 x i8> %tmp2
 }

-define <16 x i8> @neon.sshl16b_non_splat_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshl16b_non_splat_constant_shift:
-; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI167_0
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI167_0]
-; CHECK-NEXT: sshl.16b v0, v0, v1
-; CHECK-NEXT: ret
+define <16 x i8> @neon_sshl16b_non_splat_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_sshl16b_non_splat_constant_shift:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: adrp x8, .LCPI167_0
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI167_0]
+; CHECK-SD-NEXT: sshl v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_sshl16b_non_splat_constant_shift:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI167_0
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI167_0]
+; CHECK-GI-NEXT: sshl v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp2 = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> %tmp1, <16 x i8> )
 ret <16 x i8> %tmp2
 }

-define <16 x i8> @neon.sshl16b_neg_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshl16b_neg_constant_shift:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sshr.16b v0, v0, #2
-; CHECK-NEXT: ret
+define <16 x i8> @neon_sshl16b_neg_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_sshl16b_neg_constant_shift:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: sshr v0.16b, v0.16b, #2
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_sshl16b_neg_constant_shift:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.16b, #254
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: sshl v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp2 = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> %tmp1, <16 x i8> )
 ret <16 x i8> %tmp2
 }

-define <8 x i16> @neon.sshll8h_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshll8h_constant_shift:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: sshll.8h v0, v0, #1
-; CHECK-NEXT: ret
+define <8 x i16> @neon_sshll8h_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_sshll8h_constant_shift:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_sshll8h_constant_shift:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: movi v1.8h, #1
+; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: sshl v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp2 = sext <8 x i8> %tmp1 to <8 x i16>
 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sshl.v8i16(<8 x i16> %tmp2, <8 x i16> )
 ret <8 x i16> %tmp3
 }

-define <4 x i32> @neon.sshl4s_wrong_ext_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshl4s_wrong_ext_constant_shift:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr s0, [x0]
-; CHECK-NEXT: sshll.8h v0, v0, #0
-; CHECK-NEXT: sshll.4s v0, v0, #1
-; CHECK-NEXT: ret
+define <4 x i32> @neon_sshl4s_wrong_ext_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_sshl4s_wrong_ext_constant_shift:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_sshl4s_wrong_ext_constant_shift:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr w8, [x0]
+; CHECK-GI-NEXT: movi v0.4s, #1
+; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: mov b2, v1.b[2]
+; CHECK-GI-NEXT: mov b3, v1.b[1]
+; CHECK-GI-NEXT: mov b4, v1.b[3]
+; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: fmov w9, s2
+; CHECK-GI-NEXT: fmov w10, s3
+; CHECK-GI-NEXT: fmov w11, s4
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: fmov s2, w9
+; CHECK-GI-NEXT: mov v1.h[1], w10
+; CHECK-GI-NEXT: mov v2.h[1], w11
+; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT: mov v1.d[1], v2.d[0]
+; CHECK-GI-NEXT: sshl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i8>, ptr %A
 %tmp2 = sext <4 x i8> %tmp1 to <4 x i32>
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> %tmp2, <4 x i32> )
 ret <4 x i32> %tmp3
 }

-define <4 x i32> @neon.sshll4s_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshll4s_constant_shift:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: sshll.4s v0, v0, #1
-; CHECK-NEXT: ret
+define <4 x i32> @neon_sshll4s_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_sshll4s_constant_shift:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_sshll4s_constant_shift:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: movi v1.4s, #1
+; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp2 = sext <4 x i16> %tmp1 to <4 x i32>
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> %tmp2, <4 x i32> )
 ret <4 x i32> %tmp3
 }

-define <4 x i32> @neon.sshll4s_neg_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshll4s_neg_constant_shift:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: sshll.4s v0, v0, #0
-; CHECK-NEXT: sshr.4s v0, v0, #1
-; CHECK-NEXT: ret
+define <4 x i32> @neon_sshll4s_neg_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_sshll4s_neg_constant_shift:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: sshr v0.4s, v0.4s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_sshll4s_neg_constant_shift:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: movi v1.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp2 = sext <4 x i16> %tmp1 to <4 x i32>
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> %tmp2, <4 x i32> )
@@ -2308,46 +2673,70 @@ define <4 x i32> @neon.sshll4s_neg_constant_shift(ptr %A) nounwind {
 }

 ; FIXME: should be constant folded.
-define <4 x i32> @neon.sshl4s_constant_fold() nounwind {
-; CHECK-LABEL: neon.sshl4s_constant_fold:
-; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI173_0
-; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI173_0]
-; CHECK-NEXT: shl.4s v0, v0, #2
-; CHECK-NEXT: ret
+define <4 x i32> @neon_sshl4s_constant_fold() nounwind {
+; CHECK-SD-LABEL: neon_sshl4s_constant_fold:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: adrp x8, .LCPI173_0
+; CHECK-SD-NEXT: ldr q0, [x8, :lo12:.LCPI173_0]
+; CHECK-SD-NEXT: shl v0.4s, v0.4s, #2
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_sshl4s_constant_fold:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.4s, #2
+; CHECK-GI-NEXT: adrp x8, .LCPI173_0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI173_0]
+; CHECK-GI-NEXT: sshl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: ret
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> , <4 x i32> )
 ret <4 x i32> %tmp3
 }

-define <4 x i32> @neon.sshl4s_no_fold(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshl4s_no_fold:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: add.4s v0, v0, v0
-; CHECK-NEXT: ret
+define <4 x i32> @neon_sshl4s_no_fold(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_sshl4s_no_fold:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: add v0.4s, v0.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_sshl4s_no_fold:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.4s, #1
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: sshl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> %tmp1, <4 x i32> )
 ret <4 x i32> %tmp3
 }

-define <2 x i64> @neon.sshll2d_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshll2d_constant_shift:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: sshll.2d v0, v0, #1
-; CHECK-NEXT: ret
+define <2 x i64> @neon_sshll2d_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_sshll2d_constant_shift:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: sshll v0.2d, v0.2s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_sshll2d_constant_shift:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: adrp x8, .LCPI175_0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI175_0]
+; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT: sshl v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp2 = sext <2 x i32> %tmp1 to <2 x i64>
 %tmp3 = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> %tmp2, <2 x i64> )
 ret <2 x i64> %tmp3
 }

-define <1 x i64> @neon.sshll_vscalar_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshll_vscalar_constant_shift:
+define <1 x i64> @neon_sshll_vscalar_constant_shift(ptr %A) nounwind {
+; CHECK-LABEL: neon_sshll_vscalar_constant_shift:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi.2d v1, #0000000000000000
-; CHECK-NEXT: ldr s0, [x0]
-; CHECK-NEXT: zip1.2s v0, v0, v1
+; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: ldr s1, [x0]
+; CHECK-NEXT: zip1 v0.2s, v1.2s, v0.2s
 ; CHECK-NEXT: shl d0, d0, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <1 x i32>, ptr %A
@@ -2356,8 +2745,8 @@ define <1 x i64> @neon.sshll_vscalar_constant_shift(ptr %A) nounwind {
 ret <1 x i64> %tmp3
 }

-define i64 @neon.sshll_scalar_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshll_scalar_constant_shift:
+define i64 @neon_sshll_scalar_constant_shift(ptr %A) nounwind {
+; CHECK-LABEL: neon_sshll_scalar_constant_shift:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr w8, [x0]
 ; CHECK-NEXT: fmov d0, x8
@@ -2370,8 +2759,8 @@ define i64 @neon.sshll_scalar_constant_shift(ptr %A) nounwind {
 ret i64 %tmp3
 }

-define i64 @neon.sshll_scalar_constant_shift_m1(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshll_scalar_constant_shift_m1:
+define i64 @neon_sshll_scalar_constant_shift_m1(ptr %A) nounwind {
+; CHECK-LABEL: neon_sshll_scalar_constant_shift_m1:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr w8, [x0]
 ; CHECK-NEXT: fmov d0, x8
@@ -2385,34 +2774,58 @@ define i64 @neon.sshll_scalar_constant_shift_m1(ptr %A) nounwind {
 }

 ; FIXME: should be constant folded.
-define <2 x i64> @neon.sshl2d_constant_fold() nounwind {
-; CHECK-LABEL: neon.sshl2d_constant_fold:
-; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI179_0
-; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI179_0]
-; CHECK-NEXT: add.2d v0, v0, v0
-; CHECK-NEXT: ret
+define <2 x i64> @neon_sshl2d_constant_fold() nounwind {
+; CHECK-SD-LABEL: neon_sshl2d_constant_fold:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: adrp x8, .LCPI179_0
+; CHECK-SD-NEXT: ldr q0, [x8, :lo12:.LCPI179_0]
+; CHECK-SD-NEXT: add v0.2d, v0.2d, v0.2d
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_sshl2d_constant_fold:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI179_1
+; CHECK-GI-NEXT: adrp x9, .LCPI179_0
+; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI179_1]
+; CHECK-GI-NEXT: ldr q1, [x9, :lo12:.LCPI179_0]
+; CHECK-GI-NEXT: sshl v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: ret
 %tmp3 = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> , <2 x i64> )
 ret <2 x i64> %tmp3
 }

-define <2 x i64> @neon.sshl2d_no_fold(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshl2d_no_fold:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: shl.2d v0, v0, #2
-; CHECK-NEXT: ret
+define <2 x i64> @neon_sshl2d_no_fold(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_sshl2d_no_fold:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: shl v0.2d, v0.2d, #2
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_sshl2d_no_fold:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI180_0
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI180_0]
+; CHECK-GI-NEXT: sshl v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT: ret
 %tmp2 = load <2 x i64>, ptr %A
 %tmp3 = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> %tmp2, <2 x i64> )
 ret <2 x i64> %tmp3
 }

 define <8 x i16> @sshll2_8h(ptr %A) nounwind {
-; CHECK-LABEL: sshll2_8h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0, #8]
-; CHECK-NEXT: sshll.8h v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sshll2_8h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0, #8]
+; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sshll2_8h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: mov d0, v0.d[1]
+; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #1
+; CHECK-GI-NEXT: ret
 %load1 = load <16 x i8>, ptr %A
 %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32>
 %tmp2 = sext <8 x i8> %tmp1 to <8 x i16>
@@ -2421,11 +2834,18 @@ define <8 x i16> @sshll2_8h(ptr %A) nounwind {
 }

 define <4 x i32> @sshll2_4s(ptr %A) nounwind {
-; CHECK-LABEL: sshll2_4s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0, #8]
-; CHECK-NEXT: sshll.4s v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sshll2_4s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0, #8]
+; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sshll2_4s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: mov d0, v0.d[1]
+; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #1
+; CHECK-GI-NEXT: ret
 %load1 = load <8 x i16>, ptr %A
 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32>
 %tmp2 = sext <4 x i16> %tmp1 to <4 x i32>
@@ -2434,11 +2854,18 @@ define <4 x i32> @sshll2_4s(ptr %A) nounwind {
 }

 define <2 x i64> @sshll2_2d(ptr %A) nounwind {
-; CHECK-LABEL: sshll2_2d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0, #8]
-; CHECK-NEXT: sshll.2d v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sshll2_2d:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0, #8]
+; CHECK-SD-NEXT: sshll v0.2d, v0.2s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sshll2_2d:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: mov d0, v0.d[1]
+; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #1
+; CHECK-GI-NEXT: ret
 %load1 = load <4 x i32>, ptr %A
 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32>
 %tmp2 = sext <2 x i32> %tmp1 to <2 x i64>
@@ -2447,88 +2874,145 @@ define <2 x i64> @sshll2_2d(ptr %A) nounwind {
 }

 define <8 x i8> @sqshli8b(ptr %A) nounwind {
-; CHECK-LABEL: sqshli8b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: sqshl.8b v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sqshli8b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: sqshl v0.8b, v0.8b, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sqshli8b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.8b, #1
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: sqshl v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %tmp1, <8 x i8> )
 ret <8 x i8> %tmp3
 }

 define <4 x i16> @sqshli4h(ptr %A) nounwind {
-; CHECK-LABEL: sqshli4h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: sqshl.4h v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sqshli4h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: sqshl v0.4h, v0.4h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sqshli4h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.4h, #1
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: sqshl v0.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> %tmp1, <4 x i16> )
 ret <4 x i16> %tmp3
 }

 define <2 x i32> @sqshli2s(ptr %A) nounwind {
-; CHECK-LABEL: sqshli2s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: sqshl.2s v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sqshli2s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: sqshl v0.2s, v0.2s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sqshli2s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2s, #1
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: sqshl v0.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> %tmp1, <2 x i32> )
 ret <2 x i32> %tmp3
 }

 define <16 x i8> @sqshli16b(ptr %A) nounwind {
-; CHECK-LABEL: sqshli16b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqshl.16b v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sqshli16b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: sqshl v0.16b, v0.16b, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sqshli16b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.16b, #1
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: sqshl v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %tmp1, <16 x i8> )
 ret <16 x i8> %tmp3
 }

 define <8 x i16> @sqshli8h(ptr %A) nounwind {
-; CHECK-LABEL: sqshli8h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqshl.8h v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sqshli8h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: sqshl v0.8h, v0.8h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sqshli8h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.8h, #1
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: sqshl v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> %tmp1, <8 x i16> )
 ret <8 x i16> %tmp3
 }

 define <4 x i32> @sqshli4s(ptr %A) nounwind {
-; CHECK-LABEL: sqshli4s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqshl.4s v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sqshli4s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: sqshl v0.4s, v0.4s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sqshli4s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.4s, #1
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: sqshl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> %tmp1, <4 x i32> )
 ret <4 x i32> %tmp3
 }

 define <2 x i64> @sqshli2d(ptr %A) nounwind {
-; CHECK-LABEL: sqshli2d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqshl.2d v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sqshli2d:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: sqshl v0.2d, v0.2d, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sqshli2d:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI190_0
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI190_0]
+; CHECK-GI-NEXT: sqshl v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> %tmp1, <2 x i64> )
 ret <2 x i64> %tmp3
 }

 define <8 x i8> @uqshli8b(ptr %A) nounwind {
-; CHECK-LABEL: uqshli8b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: uqshl.8b v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: uqshli8b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: uqshl v0.8b, v0.8b, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uqshli8b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.8b, #1
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: uqshl v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %tmp1, <8 x i8> )
 ret <8 x i8> %tmp3
@@ -2537,9 +3021,9 @@ define <8 x i8> @uqshli8b(ptr %A) nounwind {

 define <8 x i8> @uqshli8b_1(ptr %A) nounwind {
 ; CHECK-LABEL: uqshli8b_1:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi.8b v1, #8
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: uqshl.8b v0, v0, v1
+; CHECK-NEXT: movi v0.8b, #8
+; CHECK-NEXT: ldr d1, [x0]
+; CHECK-NEXT: uqshl v0.8b, v1.8b, v0.8b
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %tmp1, <8 x i8> )
@@ -2547,78 +3031,130 @@
 }

 define <4 x i16> @uqshli4h(ptr %A) nounwind {
-; CHECK-LABEL: uqshli4h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: uqshl.4h v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: uqshli4h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: uqshl v0.4h, v0.4h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uqshli4h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.4h, #1
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: uqshl v0.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> %tmp1, <4 x i16> )
 ret <4 x i16> %tmp3
 }

 define <2 x i32> @uqshli2s(ptr %A) nounwind {
-; CHECK-LABEL: uqshli2s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: uqshl.2s v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: uqshli2s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: uqshl v0.2s, v0.2s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uqshli2s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2s, #1
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: uqshl v0.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> %tmp1, <2 x i32> )
 ret <2 x i32> %tmp3
 }

 define <16 x i8> @uqshli16b(ptr %A) nounwind {
-; CHECK-LABEL: uqshli16b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: uqshl.16b v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: uqshli16b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: uqshl v0.16b, v0.16b, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uqshli16b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.16b, #1
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: uqshl v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp3 = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %tmp1, <16 x i8> )
 ret <16 x i8> %tmp3
 }

 define <8 x i16> @uqshli8h(ptr %A) nounwind {
-; CHECK-LABEL: uqshli8h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: uqshl.8h v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: uqshli8h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: uqshl v0.8h, v0.8h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uqshli8h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.8h, #1
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: uqshl v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> %tmp1, <8 x i16> )
 ret <8 x i16> %tmp3
 }

 define <4 x i32> @uqshli4s(ptr %A) nounwind {
-; CHECK-LABEL: uqshli4s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: uqshl.4s v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: uqshli4s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: uqshl v0.4s, v0.4s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uqshli4s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.4s, #1
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: uqshl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> %tmp1, <4 x i32> )
 ret <4 x i32> %tmp3
 }

 define <2 x i64> @uqshli2d(ptr %A) nounwind {
-; CHECK-LABEL: uqshli2d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: uqshl.2d v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: uqshli2d:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: uqshl v0.2d, v0.2d, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uqshli2d:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI198_0
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI198_0]
+; CHECK-GI-NEXT: uqshl v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp3 = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> %tmp1, <2 x i64> )
 ret <2 x i64> %tmp3
 }

 define <8 x i8> @ursra8b(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: ursra8b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: ursra.8b v0, v1, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ursra8b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d1, [x0]
+; CHECK-SD-NEXT: ldr d0, [x1]
+; CHECK-SD-NEXT: ursra v0.8b, v1.8b, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ursra8b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: urshl v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: add v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> )
 %tmp4 = load <8 x i8>, ptr %B
@@ -2627,12 +3163,21 @@ define <8 x i8> @ursra8b(ptr %A, ptr %B) nounwind {
 }

 define <4 x i16> @ursra4h(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: ursra4h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: ursra.4h v0, v1, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ursra4h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d1, [x0]
+; CHECK-SD-NEXT: ldr d0, [x1]
+; CHECK-SD-NEXT: ursra v0.4h, v1.4h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ursra4h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: urshl v0.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> )
 %tmp4 = load <4 x i16>, ptr %B
@@ -2641,12 +3186,21 @@ define <4 x i16> @ursra4h(ptr %A, ptr %B) nounwind {
 }

 define <2 x i32> @ursra2s(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: ursra2s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: ursra.2s v0, v1, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ursra2s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d1, [x0]
+; CHECK-SD-NEXT: ldr d0, [x1]
+; CHECK-SD-NEXT: ursra v0.2s, v1.2s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ursra2s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: urshl v0.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: add v0.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x i32> )
 %tmp4 = load <2 x i32>, ptr %B
@@ -2655,12 +3209,21 @@ define <2 x i32> @ursra2s(ptr %A, ptr %B) nounwind {
 }

 define <16 x i8> @ursra16b(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: ursra16b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q1, [x0]
-; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: ursra.16b v0, v1, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ursra16b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q1, [x0]
+; CHECK-SD-NEXT: ldr q0, [x1]
+; CHECK-SD-NEXT: ursra v0.16b, v1.16b, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ursra16b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: urshl v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: add v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp3 = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> )
 %tmp4 = load <16 x i8>, ptr %B
@@ -2669,12 +3232,21 @@ define <16 x i8> @ursra16b(ptr %A, ptr %B) nounwind {
 }

 define <8 x i16> @ursra8h(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: ursra8h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q1, [x0]
-; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: ursra.8h v0, v1, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ursra8h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q1, [x0]
+; CHECK-SD-NEXT: ldr q0, [x1]
+; CHECK-SD-NEXT: ursra v0.8h, v1.8h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ursra8h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: urshl v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: add v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> )
 %tmp4 = load <8 x i16>, ptr %B
@@ -2683,12 +3255,21 @@ define <8 x i16> @ursra8h(ptr %A, ptr %B) nounwind {
 }

 define <4 x i32> @ursra4s(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: ursra4s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q1, [x0]
-; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: ursra.4s v0, v1, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ursra4s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q1, [x0]
+; CHECK-SD-NEXT: ldr q0, [x1]
+; CHECK-SD-NEXT: ursra v0.4s, v1.4s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ursra4s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: urshl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> )
 %tmp4 = load <4 x i32>, ptr %B
@@ -2697,12 +3278,21 @@ define <4 x i32> @ursra4s(ptr %A, ptr %B) nounwind {
 }

 define <2 x i64> @ursra2d(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: ursra2d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q1, [x0]
-; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: ursra.2d v0, v1, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ursra2d:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q1, [x0]
+; CHECK-SD-NEXT: ldr q0, [x1]
+; CHECK-SD-NEXT: ursra v0.2d, v1.2d, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ursra2d:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: urshl v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp3 = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> )
 %tmp4 = load <2 x i64>, ptr %B
@@ -2740,12 +3330,21 @@ define i64 @ursra_scalar(ptr %A, ptr %B) nounwind {
 }

 define <8 x i8> @srsra8b(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: srsra8b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: srsra.8b v0, v1, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: srsra8b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d1, [x0]
+; CHECK-SD-NEXT: ldr d0, [x1]
+; CHECK-SD-NEXT: srsra v0.8b, v1.8b, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: srsra8b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: srshl v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: add v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> )
 %tmp4 = load <8 x i8>, ptr %B
@@ -2754,12 +3353,21 @@ define <8 x i8> @srsra8b(ptr %A, ptr %B) nounwind {
 }

 define <4 x i16> @srsra4h(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: srsra4h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: srsra.4h v0, v1, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: srsra4h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d1, [x0]
+; CHECK-SD-NEXT: ldr d0, [x1]
+; CHECK-SD-NEXT: srsra v0.4h, v1.4h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: srsra4h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: srshl v0.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> )
 %tmp4 = load <4 x i16>, ptr %B
@@ -2768,12 +3376,21 @@ define <4 x i16> @srsra4h(ptr %A, ptr %B) nounwind {
 }

 define <2 x i32> @srsra2s(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: srsra2s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: srsra.2s v0, v1, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: srsra2s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d1, [x0]
+; CHECK-SD-NEXT: ldr d0, [x1]
+; CHECK-SD-NEXT: srsra v0.2s, v1.2s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: srsra2s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: srshl v0.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: add v0.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> )
 %tmp4 = load <2 x i32>, ptr %B
@@ -2782,12 +3399,21 @@ define <2 x i32> @srsra2s(ptr %A, ptr %B) nounwind {
 }

 define <16 x i8> @srsra16b(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: srsra16b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q1, [x0]
-; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: srsra.16b v0, v1, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: srsra16b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q1, [x0]
+; CHECK-SD-NEXT: ldr q0, [x1]
+; CHECK-SD-NEXT: srsra v0.16b, v1.16b, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: srsra16b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: srshl v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: add v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp3 = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> )
 %tmp4 = load <16 x i8>, ptr %B
@@ -2796,12 +3422,21 @@ define <16 x i8> @srsra16b(ptr %A, ptr %B) nounwind {
 }

 define <8 x i16> @srsra8h(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: srsra8h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q1, [x0]
-; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: srsra.8h v0, v1, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: srsra8h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q1, [x0]
+; CHECK-SD-NEXT: ldr q0, [x1]
+; CHECK-SD-NEXT: srsra v0.8h, v1.8h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: srsra8h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: srshl v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: add v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> )
 %tmp4 = load <8 x i16>, ptr %B
@@ -2810,12 +3445,21 @@ define <8 x i16> @srsra8h(ptr %A, ptr %B) nounwind {
 }

 define <4 x i32> @srsra4s(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: srsra4s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q1, [x0]
-; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: srsra.4s v0, v1, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: srsra4s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q1, [x0]
+; CHECK-SD-NEXT: ldr q0, [x1]
+; CHECK-SD-NEXT: srsra v0.4s, v1.4s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: srsra4s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: srshl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> )
 %tmp4 = load <4 x i32>, ptr %B
@@ -2824,12 +3468,21 @@ define <4 x i32> @srsra4s(ptr %A, ptr %B) nounwind {
 }

 define <2 x i64> @srsra2d(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: srsra2d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q1, [x0]
-; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: srsra.2d v0, v1, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: srsra2d:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q1, [x0]
+; CHECK-SD-NEXT: ldr q0, [x1]
+; CHECK-SD-NEXT: srsra v0.2d, v1.2d, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: srsra2d:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: srshl v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp3 = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> )
 %tmp4 = load <2 x i64>, ptr %B
@@ -2871,7 +3524,7 @@ define <8 x i8> @usra8b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d1, [x0]
 ; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: usra.8b v0, v1, #1
+; CHECK-NEXT: usra v0.8b, v1.8b, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp3 = lshr <8 x i8> %tmp1,
@@ -2885,7 +3538,7 @@ define <4 x i16> @usra4h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d1, [x0]
 ; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: usra.4h v0, v1, #1
+; CHECK-NEXT: usra v0.4h, v1.4h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp3 = lshr <4 x i16> %tmp1,
@@ -2899,7 +3552,7 @@ define <2 x i32> @usra2s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d1, [x0]
 ; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: usra.2s v0, v1, #1
+; CHECK-NEXT: usra v0.2s, v1.2s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp3 = lshr <2 x i32> %tmp1,
@@ -2913,7 +3566,7 @@ define <16 x i8> @usra16b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q1, [x0]
 ; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: usra.16b v0, v1, #1
+; CHECK-NEXT: usra v0.16b, v1.16b, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp3 = lshr <16 x i8> %tmp1,
@@ -2927,7 +3580,7 @@ define <8 x i16> @usra8h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q1, [x0]
 ; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: usra.8h v0, v1, #1
+; CHECK-NEXT: usra v0.8h, v1.8h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = lshr <8 x i16> %tmp1,
@@ -2941,7 +3594,7 @@ define <4 x i32> @usra4s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q1, [x0]
 ; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: usra.4s v0, v1, #1
+; CHECK-NEXT: usra v0.4s, v1.4s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = lshr <4 x i32> %tmp1,
@@ -2955,7 +3608,7 @@ define <2 x i64> @usra2d(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q1, [x0]
 ; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: usra.2d v0, v1, #1
+; CHECK-NEXT: usra v0.2d, v1.2d, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp3 = lshr <2 x i64> %tmp1,
@@ -2965,12 +3618,20 @@ define <2 x i64> @usra2d(ptr %A, ptr %B) nounwind {
 }

 define <1 x i64> @usra1d(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: usra1d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: usra d0, d1, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: usra1d:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d1, [x0]
+; CHECK-SD-NEXT: ldr d0, [x1]
+; CHECK-SD-NEXT: usra d0, d1, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: usra1d:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr x8, [x0]
+; CHECK-GI-NEXT: ldr x9, [x1]
+; CHECK-GI-NEXT: add x8, x9, x8, lsr #1
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: ret
 %tmp1 = load <1 x i64>, ptr %A
 %tmp3 = lshr <1 x i64> %tmp1,
 %tmp4 = load <1 x i64>, ptr %B
@@ -2983,7 +3644,7 @@ define <8 x i8> @ssra8b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d1, [x0]
 ; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: ssra.8b v0, v1, #1
+; CHECK-NEXT: ssra v0.8b, v1.8b, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp3 = ashr <8 x i8> %tmp1,
@@ -2997,7 +3658,7 @@ define <4 x i16> @ssra4h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d1, [x0]
 ; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: ssra.4h v0, v1, #1
+; CHECK-NEXT: ssra v0.4h, v1.4h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp3 = ashr <4 x i16> %tmp1,
@@ -3011,7 +3672,7 @@ define <2 x i32> @ssra2s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d1, [x0]
 ; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: ssra.2s v0, v1, #1
+; CHECK-NEXT: ssra v0.2s, v1.2s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp3 = ashr <2 x i32> %tmp1,
@@ -3025,7 +3686,7 @@ define <16 x i8> @ssra16b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q1, [x0]
 ; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: ssra.16b v0, v1, #1
+; CHECK-NEXT: ssra v0.16b, v1.16b, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp3 = ashr <16 x i8> %tmp1,
@@ -3039,7 +3700,7 @@ define <8 x i16> @ssra8h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q1, [x0]
 ; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: ssra.8h v0, v1, #1
+; CHECK-NEXT: ssra v0.8h, v1.8h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = ashr <8 x i16> %tmp1,
@@ -3053,7 +3714,7 @@ define <4 x i32> @ssra4s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q1, [x0]
 ; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: ssra.4s v0, v1, #1
+; CHECK-NEXT: ssra v0.4s, v1.4s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = ashr <4 x i32> %tmp1,
@@ -3067,7 +3728,7 @@ define <2 x i64> @ssra2d(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q1, [x0]
 ; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: ssra.2d v0, v1, #1
+; CHECK-NEXT: ssra v0.2d, v1.2d, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp3 = ashr <2 x i64> %tmp1,
@@ -3081,8 +3742,8 @@ define <8 x i8> @shr_orr8b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: ushr.8b v0, v0, #1
-; CHECK-NEXT: orr.8b v0, v0, v1
+; CHECK-NEXT: ushr v0.8b, v0.8b, #1
+; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp4 = load <8 x i8>, ptr %B
@@ -3096,8 +3757,8 @@ define <4 x i16> @shr_orr4h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: ushr.4h v0, v0, #1
-; CHECK-NEXT: orr.8b v0, v0, v1
+; CHECK-NEXT: ushr v0.4h, v0.4h, #1
+; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp4 = load <4 x i16>, ptr %B
@@ -3111,8 +3772,8 @@ define <2 x i32> @shr_orr2s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: ushr.2s v0, v0, #1
-; CHECK-NEXT: orr.8b v0, v0, v1
+; CHECK-NEXT: ushr v0.2s, v0.2s, #1
+; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp4 = load <2 x i32>, ptr %B
@@ -3126,8 +3787,8 @@ define <16 x i8> @shr_orr16b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: ushr.16b v0, v0, #1
-; CHECK-NEXT: orr.16b v0, v0, v1
+; CHECK-NEXT: ushr v0.16b, v0.16b, #1
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp4 = load <16 x i8>, ptr %B
@@ -3141,8 +3802,8 @@ define <8 x i16> @shr_orr8h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: ushr.8h v0, v0, #1
-; CHECK-NEXT: orr.16b v0, v0, v1
+; CHECK-NEXT: ushr v0.8h, v0.8h, #1
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp4 = load <8 x i16>, ptr %B
@@ -3156,8 +3817,8 @@ define <4 x i32> @shr_orr4s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: ushr.4s v0, v0, #1
-; CHECK-NEXT: orr.16b v0, v0, v1
+; CHECK-NEXT: ushr v0.4s, v0.4s, #1
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp4 = load <4 x i32>, ptr %B
@@ -3171,8 +3832,8 @@ define <2 x i64> @shr_orr2d(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: ushr.2d v0, v0, #1
-; CHECK-NEXT: orr.16b v0, v0, v1
+; CHECK-NEXT: ushr v0.2d, v0.2d, #1
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp4 = load <2 x i64>, ptr %B
@@ -3182,13 +3843,21 @@ define <2 x i64> @shr_orr2d(ptr %A, ptr %B) nounwind {
 }

 define <8 x i8> @shl_orr8b(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: shl_orr8b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: add.8b v0, v0, v0
-; CHECK-NEXT: orr.8b v0, v0, v1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: shl_orr8b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: ldr d1, [x1]
+; CHECK-SD-NEXT: add v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shl_orr8b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: shl v0.8b, v0.8b, #1
+; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp4 = load <8 x i8>, ptr %B
 %tmp3 = shl <8 x i8> %tmp1,
@@ -3197,13 +3866,21 @@ define <8 x i8> @shl_orr8b(ptr %A, ptr %B) nounwind {
 }

 define <4 x i16> @shl_orr4h(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: shl_orr4h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: add.4h v0, v0, v0
-; CHECK-NEXT: orr.8b v0, v0, v1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: shl_orr4h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: ldr d1, [x1]
+; CHECK-SD-NEXT: add v0.4h, v0.4h, v0.4h
+; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shl_orr4h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: shl v0.4h, v0.4h, #1
+; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp4 = load <4 x i16>, ptr %B
 %tmp3 = shl <4 x i16> %tmp1,
@@ -3212,13 +3889,21 @@ define <4 x i16> @shl_orr4h(ptr %A, ptr %B) nounwind {
 }

 define <2 x i32> @shl_orr2s(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: shl_orr2s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: add.2s v0, v0, v0
-; CHECK-NEXT: orr.8b v0, v0, v1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: shl_orr2s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: ldr d1, [x1]
+; CHECK-SD-NEXT: add v0.2s, v0.2s, v0.2s
+; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shl_orr2s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: shl v0.2s, v0.2s, #1
+; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp4 = load <2 x i32>, ptr %B
 %tmp3 = shl <2 x i32> %tmp1,
@@ -3227,13 +3912,21 @@ define <2 x i32> @shl_orr2s(ptr %A, ptr %B) nounwind {
 }

 define <16 x i8> @shl_orr16b(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: shl_orr16b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: add.16b v0, v0, v0
-; CHECK-NEXT: orr.16b v0, v0, v1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: shl_orr16b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: ldr q1, [x1]
+; CHECK-SD-NEXT: add v0.16b, v0.16b, v0.16b
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shl_orr16b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: shl v0.16b, v0.16b, #1
+; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp4 = load <16 x i8>, ptr %B
 %tmp3 = shl <16 x i8> %tmp1,
@@ -3242,13 +3935,21 @@ define <16 x i8> @shl_orr16b(ptr %A, ptr %B) nounwind {
 }

 define <8 x i16> @shl_orr8h(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: shl_orr8h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: add.8h v0, v0, v0
-; CHECK-NEXT: orr.16b v0, v0, v1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: shl_orr8h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: ldr q1, [x1]
+; CHECK-SD-NEXT: add v0.8h, v0.8h, v0.8h
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shl_orr8h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: shl v0.8h, v0.8h, #1
+; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp4 = load <8 x i16>, ptr %B
 %tmp3 = shl <8 x i16> %tmp1,
@@ -3257,13 +3958,21 @@ define <8 x i16> @shl_orr8h(ptr %A, ptr %B) nounwind {
 }

 define <4 x i32> @shl_orr4s(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: shl_orr4s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: add.4s v0, v0, v0
-; CHECK-NEXT: orr.16b v0, v0, v1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: shl_orr4s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: ldr q1, [x1]
+; CHECK-SD-NEXT: add v0.4s, v0.4s, v0.4s
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shl_orr4s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: shl v0.4s, v0.4s, #1
+; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp4 = load <4 x i32>, ptr %B
 %tmp3 = shl <4 x i32> %tmp1,
@@ -3272,13 +3981,21 @@ define <4 x i32> @shl_orr4s(ptr %A, ptr %B) nounwind {
 }

 define <2 x i64> @shl_orr2d(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: shl_orr2d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: add.2d v0, v0, v0
-; CHECK-NEXT: orr.16b v0, v0, v1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: shl_orr2d:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: ldr q1, [x1]
+; CHECK-SD-NEXT: add v0.2d, v0.2d, v0.2d
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shl_orr2d:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: shl v0.2d, v0.2d, #1
+; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp4 = load <2 x i64>, ptr %B
 %tmp3 = shl <2 x i64> %tmp1,
@@ -3287,20 +4004,32 @@ define <2 x i64> @shl_orr2d(ptr %A, ptr %B) nounwind {
 }

 define <8 x i16> @shll(<8 x i8> %in) {
-; CHECK-LABEL: shll:
-; CHECK: // %bb.0:
-; CHECK-NEXT: shll.8h v0, v0, #8
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: shll:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: shll v0.8h, v0.8b, #8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shll:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: shl v0.8h, v0.8h, #8
+; CHECK-GI-NEXT: ret
 %ext = zext <8 x i8> %in to <8 x i16>
 %res = shl <8 x i16> %ext,
 ret <8 x i16> %res
 }

 define <4 x i32> @shll_high(<8 x i16> %in) {
-; CHECK-LABEL: shll_high:
-; CHECK: // %bb.0:
-; CHECK-NEXT: shll2.4s v0, v0, #16
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: shll_high:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: shll2 v0.4s, v0.8h, #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shll_high:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT: shl v0.4s, v0.4s, #16
+; CHECK-GI-NEXT: ret
 %extract = shufflevector <8 x i16> %in, <8 x i16> undef, <4 x i32>
 %ext = zext <4 x i16> %extract to <4 x i32>
 %res = shl <4 x i32> %ext,
@@ -3312,7 +4041,7 @@ define <8 x i8> @sli8b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: sli.8b v0, v1, #1
+; CHECK-NEXT: sli v0.8b, v1.8b, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp2 = load <8 x i8>, ptr %B
@@ -3325,7 +4054,7 @@ define <4 x i16> @sli4h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: sli.4h v0, v1, #1
+; CHECK-NEXT: sli v0.4h, v1.4h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp2 = load <4 x i16>, ptr %B
@@ -3338,7 +4067,7 @@ define <2 x i32> @sli2s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: sli.2s v0, v1, #1
+; CHECK-NEXT: sli v0.2s, v1.2s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp2 = load <2 x i32>, ptr %B
@@ -3364,7 +4093,7 @@ define <16 x i8> @sli16b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sli.16b v0, v1, #1
+; CHECK-NEXT: sli v0.16b, v1.16b, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp2 = load <16 x i8>, ptr %B
@@ -3377,7 +4106,7 @@ define <8 x i16> @sli8h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sli.8h v0, v1, #1
+; CHECK-NEXT: sli v0.8h, v1.8h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp2 = load <8 x i16>, ptr %B
@@ -3390,7 +4119,7 @@ define <4 x i32> @sli4s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sli.4s v0, v1, #1
+; CHECK-NEXT: sli v0.4s, v1.4s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp2 = load <4 x i32>, ptr %B
@@ -3403,7 +4132,7 @@ define <2 x i64> @sli2d(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sli.2d v0, v1, #1
+; CHECK-NEXT: sli v0.2d, v1.2d, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp2 = load <2 x i64>, ptr %B
@@ -3422,21 +4151,37 @@ declare <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32>, <4 x i32>, i32) nounw
 declare <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64>, <2 x i64>, i32) nounwind readnone

 define <1 x i64> @ashr_v1i64(<1 x i64> %a, <1 x i64> %b) {
-; CHECK-LABEL: ashr_v1i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: neg d1, d1
-; CHECK-NEXT: sshl d0, d0, d1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ashr_v1i64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: neg d1, d1
+; CHECK-SD-NEXT: sshl d0, d0, d1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ashr_v1i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: fmov x8, d0
+; CHECK-GI-NEXT: fmov x9, d1
+; CHECK-GI-NEXT: asr x8, x8, x9
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: ret
 %c = ashr <1 x i64> %a, %b
 ret <1 x i64> %c
 }

 define void @sqshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) {
-; CHECK-LABEL: sqshl_zero_shift_amount:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: addp.2d v0, v0, v1
-; CHECK-NEXT: str q0, [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sqshl_zero_shift_amount:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: addp v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT: str q0, [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sqshl_zero_shift_amount:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
+; CHECK-GI-NEXT: addp v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: sqshl v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: str q0, [x0]
+; CHECK-GI-NEXT: ret
 entry:
 %vpaddq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %a, <2 x i64> %b)
 %vshlq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> %vpaddq_v2.i.i, <2 x i64> zeroinitializer)
@@ -3445,11 +4190,19 @@ entry:
 define void @uqshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) {
-; CHECK-LABEL: uqshl_zero_shift_amount:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: addp.2d v0, v0, v1
-; CHECK-NEXT: str q0, [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: uqshl_zero_shift_amount:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: addp v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT: str q0, [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uqshl_zero_shift_amount:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
+; CHECK-GI-NEXT: addp v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: uqshl v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: str q0, [x0]
+; CHECK-GI-NEXT: ret
 entry:
 %vpaddq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %a, <2 x i64> %b)
 %vshlq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> %vpaddq_v2.i.i, <2 x i64> zeroinitializer)
@@ -3458,11 +4211,19 @@ entry:
 define void @srshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) {
-; CHECK-LABEL: srshl_zero_shift_amount:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: addp.2d v0, v0, v1
-; CHECK-NEXT: str q0, [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: srshl_zero_shift_amount:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: addp v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT: str q0, [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: srshl_zero_shift_amount:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
+; CHECK-GI-NEXT: addp v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: srshl v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: str q0, [x0]
+; CHECK-GI-NEXT: ret
 entry:
 %vpaddq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %a, <2 x i64> %b)
 %vshlq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %vpaddq_v2.i.i, <2 x i64> zeroinitializer)
@@ -3471,11 +4232,19 @@ entry:
 define void @urshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) {
-; CHECK-LABEL: urshl_zero_shift_amount:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: addp.2d v0, v0, v1
-; CHECK-NEXT: str q0, [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: urshl_zero_shift_amount:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: addp v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT: str q0, [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: urshl_zero_shift_amount:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
+; CHECK-GI-NEXT: addp v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: urshl v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: str q0, [x0]
+; CHECK-GI-NEXT: ret
 entry:
 %vpaddq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %a, <2 x i64> %b)
 %vshlq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %vpaddq_v2.i.i, <2 x i64> zeroinitializer)
@@ -3486,8 +4255,8 @@ entry:
 define void @sqshlu_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) {
 ; CHECK-LABEL: sqshlu_zero_shift_amount:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: addp.2d v0, v0, v1
-; CHECK-NEXT: sqshlu.2d v0, v0, #0
+; CHECK-NEXT: addp v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: sqshlu v0.2d, v0.2d, #0
 ; CHECK-NEXT: str q0, [x0]
 ; CHECK-NEXT: ret
 entry:
@@ -3498,11 +4267,19 @@ entry:
 }

 define void @sshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) {
-; CHECK-LABEL: sshl_zero_shift_amount:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: addp.2d v0, v0, v1
-; CHECK-NEXT: str q0, [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sshl_zero_shift_amount:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: addp v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT: str q0, [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sshl_zero_shift_amount:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
+; CHECK-GI-NEXT: addp v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: sshl v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: str q0, [x0]
+; CHECK-GI-NEXT: ret
 entry:
 %vpaddq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %a, <2 x i64> %b)
 %vshlq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> %vpaddq_v2.i.i, <2 x i64> zeroinitializer)
@@ -3511,11 +4288,19 @@ entry:
 define void @ushl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) {
-; CHECK-LABEL: ushl_zero_shift_amount:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: addp.2d v0, v0, v1
-; CHECK-NEXT: str q0, [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ushl_zero_shift_amount:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: addp v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT: str q0, [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ushl_zero_shift_amount:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
+; CHECK-GI-NEXT: addp v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: ushl v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: str q0, [x0]
+; CHECK-GI-NEXT: ret
 entry:
 %vpaddq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %a, <2 x i64> %b)
 %vshlq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> %vpaddq_v2.i.i, <2 x i64> zeroinitializer)
@@ -3526,8 +4311,8 @@ entry:
 define <4 x i32> @sext_rshrn(<4 x i32> noundef %a) {
 ; CHECK-LABEL: sext_rshrn:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: rshrn.4h v0, v0, #13
-; CHECK-NEXT: sshll.4s v0, v0, #0
+; CHECK-NEXT: rshrn v0.4h, v0.4s, #13
+; CHECK-NEXT: sshll v0.4s, v0.4h, #0
 ; CHECK-NEXT: ret
 entry:
 %vrshrn_n1 = tail call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %a, i32 13)
@@ -3538,8 +4323,8 @@ entry:
 define <4 x i32> @zext_rshrn(<4 x i32> noundef %a) {
 ; CHECK-LABEL: zext_rshrn:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: rshrn.4h v0, v0, #13
-; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: rshrn v0.4h, v0.4s, #13
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
 ; CHECK-NEXT: ret
 entry:
 %vrshrn_n1 = tail call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %a, i32 13)
@@ -3550,9 +4335,9 @@ entry:
 define <4 x i16> @mul_rshrn(<4 x i32> noundef %a) {
 ; CHECK-LABEL: mul_rshrn:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: movi.4s v1, #3
-; CHECK-NEXT: add.4s v0, v0, v1
-; CHECK-NEXT: rshrn.4h v0, v0, #13
+; CHECK-NEXT: movi v1.4s, #3
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: rshrn v0.4h, v0.4s, #13
 ; CHECK-NEXT: ret
 entry:
 %b = add <4 x i32> %a,
@@ -3561,15 +4346,61 @@ entry:
 }

 define <8 x i16> @signbits_vashr(<8 x i16> %a) {
-; CHECK-LABEL: signbits_vashr:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sshr.8h v0, v0, #8
-; CHECK-NEXT: sshr.8h v0, v0, #9
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: signbits_vashr:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sshr v0.8h, v0.8h, #8
+; CHECK-SD-NEXT: sshr v0.8h, v0.8h, #9
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: signbits_vashr:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mvni v1.8h, #7
+; CHECK-GI-NEXT: mvni v2.8h, #8
+; CHECK-GI-NEXT: sshl v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: sshl v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT: sshr v0.8h, v0.8h, #7
+; CHECK-GI-NEXT: ret
 %b = call <8 x i16> @llvm.aarch64.neon.sshl.v8i16(<8 x i16> %a, <8 x i16> )
 %c = call <8 x i16> @llvm.aarch64.neon.sshl.v8i16(<8 x i16> %b, <8 x i16> )
 %d = ashr <8 x i16> %c,
 ret <8 x i16> %d
 }

+define <2 x i8> @lshr_trunc_v2i64_v2i8(<2 x i64> %a) {
+; CHECK-LABEL: lshr_trunc_v2i64_v2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shrn v0.2s, v0.2d, #16
+; CHECK-NEXT: ret
+ %b = lshr <2 x i64> %a,
+ %c = trunc <2 x i64> %b to <2 x i8>
+ ret <2 x i8> %c
+}
+
+define <2 x i8> @ashr_trunc_v2i64_v2i8(<2 x i64> %a) {
+; CHECK-LABEL: ashr_trunc_v2i64_v2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shrn v0.2s, v0.2d, #16
+; CHECK-NEXT: ret
+ %b = ashr <2 x i64> %a,
+ %c = trunc <2 x i64> %b to <2 x i8>
+ ret <2 x i8> %c
+}
+
+define <2 x i8> @shl_trunc_v2i64_v2i8(<2 x i64> %a) {
+; CHECK-SD-LABEL: shl_trunc_v2i64_v2i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: xtn v0.2s, v0.2d
+; CHECK-SD-NEXT: shl v0.2s, v0.2s, #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shl_trunc_v2i64_v2i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: shl v0.2d, v0.2d, #16
+; CHECK-GI-NEXT: xtn v0.2s, v0.2d
+; CHECK-GI-NEXT: ret
+ %b = shl <2 x i64> %a,
+ %c = trunc <2 x i64> %b to <2 x i8>
+ ret <2 x i8> %c
+}
+
 declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>)
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
index 5a5dee0b53d43..4cb1d5b2fb345 100644
--- a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
@@ -5,7 +5,7 @@
 declare void @llvm.masked.scatter.nxv16i8.nxv16p0(, , i32 immarg, )

-define fastcc i8 @allocno_reload_assign() {
+define fastcc i8 @allocno_reload_assign(ptr %p) {
 ; CHECK-LABEL: allocno_reload_assign:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: fmov d0, xzr
@@ -14,8 +14,8 @@ define fastcc i8 @allocno_reload_assign() {
 ; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, #0
 ; CHECK-NEXT: uzp1 p0.s, p0.s, p0.s
 ; CHECK-NEXT: uzp1 p0.h, p0.h, p0.h
-; CHECK-NEXT: uzp1 p0.b, p0.b, p0.b
-; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1
+; CHECK-NEXT: uzp1 p8.b, p0.b, p0.b
+; CHECK-NEXT: mov z0.b, p8/z, #1 // =0x1
 ; CHECK-NEXT: fmov w8, s0
 ; CHECK-NEXT: mov z0.b, #0 // =0x0
 ; CHECK-NEXT: uunpklo z1.h, z0.b
@@ -30,34 +30,35 @@ define fastcc i8 @allocno_reload_assign() {
 ; CHECK-NEXT: punpklo p1.h, p0.b
 ; CHECK-NEXT: punpkhi p0.h, p0.b
 ; CHECK-NEXT: punpklo p2.h, p1.b
-; CHECK-NEXT: punpkhi p3.h, p1.b
+; CHECK-NEXT: punpkhi p4.h, p1.b
 ; CHECK-NEXT: uunpklo z0.d, z2.s
 ; CHECK-NEXT: uunpkhi z1.d, z2.s
-; CHECK-NEXT: punpklo p5.h, p0.b
+; CHECK-NEXT: punpklo p6.h, p0.b
 ; CHECK-NEXT: uunpklo z2.d, z3.s
 ; CHECK-NEXT: uunpkhi z3.d, z3.s
-; CHECK-NEXT: punpkhi p7.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
 ; CHECK-NEXT: uunpklo z4.d, z5.s
 ; CHECK-NEXT: uunpkhi z5.d, z5.s
 ; CHECK-NEXT: uunpklo z6.d, z7.s
 ; CHECK-NEXT: uunpkhi z7.d, z7.s
-; CHECK-NEXT: punpklo p0.h, p2.b
-; CHECK-NEXT: punpkhi p1.h, p2.b
-; CHECK-NEXT: punpklo p2.h, p3.b
-; CHECK-NEXT: punpkhi p3.h, p3.b
-; CHECK-NEXT: punpklo p4.h, p5.b
-; CHECK-NEXT: punpkhi p5.h, p5.b
-; CHECK-NEXT: punpklo p6.h, p7.b
-; CHECK-NEXT: punpkhi p7.h, p7.b
+; CHECK-NEXT: punpklo p1.h, p2.b
+; CHECK-NEXT: punpkhi p2.h, p2.b
+; CHECK-NEXT: punpklo p3.h, p4.b
+; CHECK-NEXT: punpkhi p4.h, p4.b
+; CHECK-NEXT: punpklo p5.h, p6.b
+; CHECK-NEXT: punpkhi p6.h, p6.b
+; CHECK-NEXT: punpklo p7.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
 ; CHECK-NEXT: .LBB0_1: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: st1b { z0.d }, p0, [z16.d]
-; CHECK-NEXT: st1b { z1.d }, p1, [z16.d]
-; CHECK-NEXT: st1b { z2.d }, p2, [z16.d]
-; CHECK-NEXT: st1b { z3.d }, p3, [z16.d]
-; CHECK-NEXT: st1b { z4.d }, p4, [z16.d]
-; CHECK-NEXT: st1b { z5.d }, p5, [z16.d]
-; CHECK-NEXT: st1b { z6.d }, p6, [z16.d]
-; CHECK-NEXT: st1b { z7.d }, p7, [z16.d]
+; CHECK-NEXT: st1b { z0.d }, p1, [z16.d]
+; CHECK-NEXT: st1b { z1.d }, p2, [z16.d]
+; CHECK-NEXT: st1b { z2.d }, p3, [z16.d]
+; CHECK-NEXT: st1b { z3.d }, p4, [z16.d]
+; CHECK-NEXT: st1b { z4.d }, p5, [z16.d]
+; CHECK-NEXT: st1b { z5.d }, p6, [z16.d]
+; CHECK-NEXT: st1b { z6.d }, p7, [z16.d]
+; CHECK-NEXT: st1b { z7.d }, p0, [z16.d]
+; CHECK-NEXT: str p8, [x0]
 ; CHECK-NEXT: b .LBB0_1
 br label %1

@@ -66,6 +67,7 @@ define fastcc i8 @allocno_reload_assign() {
 %constexpr1 = shufflevector %constexpr, poison, zeroinitializer
 %constexpr2 = xor %constexpr1, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)
 call void @llvm.masked.scatter.nxv16i8.nxv16p0( zeroinitializer, zeroinitializer, i32 0, %constexpr2)
+ store %constexpr, ptr %p, align 16
 br label %1
 }
diff --git a/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll b/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll
new file mode 100644
index 0000000000000..2388a0f206a51
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll
@@ -0,0 +1,225 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+sve < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+
+define i1 @extract_icmp_v4i32_const_splat_rhs(<4 x i32> %a) {
+; CHECK-LABEL: extract_icmp_v4i32_const_splat_rhs:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: cmp w8, #5
+; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: ret
+ %icmp = icmp ult <4 x i32> %a, splat (i32 5)
+ %ext = extractelement <4 x i1> %icmp, i32 1
+ ret i1 %ext
+}
+
+define i1 @extract_icmp_v4i32_const_splat_lhs(<4 x i32> %a) {
+; CHECK-LABEL: extract_icmp_v4i32_const_splat_lhs:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: cmp w8, #7
+; CHECK-NEXT: cset w0, hi
+; CHECK-NEXT: ret
+ %icmp = icmp ult <4 x i32> splat(i32 7), %a
+ %ext = extractelement <4 x i1> %icmp, i32 1
+ ret i1 %ext
+}
+
+define i1 @extract_icmp_v4i32_const_vec_rhs(<4 x i32> %a) {
+; CHECK-LABEL: extract_icmp_v4i32_const_vec_rhs:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: cmp w8, #234
+; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: ret
+ %icmp = icmp ult <4 x i32> %a,
+ %ext = extractelement <4 x i1> %icmp, i32 1
+ ret i1 %ext
+}
+
+define i1 @extract_fcmp_v4f32_const_splat_rhs(<4 x float> %a) {
+; CHECK-LABEL: extract_fcmp_v4f32_const_splat_rhs:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov s0, v0.s[1]
+; CHECK-NEXT: fmov s1, #4.00000000
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset w0, lt
+; CHECK-NEXT: ret
+ %fcmp = fcmp ult <4 x float> %a, splat(float 4.0e+0)
+ %ext = extractelement <4 x i1> %fcmp, i32 1
+ ret i1 %ext
+}
+
+define void @vector_loop_with_icmp(ptr nocapture noundef writeonly %dest) {
+; CHECK-LABEL: vector_loop_with_icmp:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: index z0.d, #0, #1
+; CHECK-NEXT: mov w8, #4 // =0x4
+; CHECK-NEXT: mov w9, #16 // =0x10
+; CHECK-NEXT: dup v2.2d, x8
+; CHECK-NEXT: add x8, x0, #8
+; CHECK-NEXT: mov w10, #1 // =0x1
+; CHECK-NEXT: mov z1.d, z0.d
+; CHECK-NEXT: add z1.d, z1.d, #2 // =0x2
+; CHECK-NEXT: b .LBB4_2
+; CHECK-NEXT: .LBB4_1: // %pred.store.continue18
+; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT: add v1.2d, v1.2d, v2.2d
+; CHECK-NEXT: add v0.2d, v0.2d, v2.2d
+; CHECK-NEXT: subs x9, x9, #4
+; CHECK-NEXT: add x8, x8, #16
+; CHECK-NEXT: b.eq .LBB4_10
+; CHECK-NEXT: .LBB4_2: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: fmov x11, d0
+; CHECK-NEXT: cmp x11, #14
+; CHECK-NEXT: b.hi .LBB4_4
+; CHECK-NEXT: // %bb.3: // %pred.store.if
+; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT: stur w10, [x8, #-8]
+; CHECK-NEXT: .LBB4_4: // %pred.store.continue
+; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT: mov x11, v0.d[1]
+; CHECK-NEXT: cmp x11, #14
+; CHECK-NEXT:
b.hi .LBB4_6 +; CHECK-NEXT: // %bb.5: // %pred.store.if5 +; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1 +; CHECK-NEXT: stur w10, [x8, #-4] +; CHECK-NEXT: .LBB4_6: // %pred.store.continue6 +; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1 +; CHECK-NEXT: fmov x11, d1 +; CHECK-NEXT: cmp x11, #14 +; CHECK-NEXT: b.hi .LBB4_8 +; CHECK-NEXT: // %bb.7: // %pred.store.if7 +; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1 +; CHECK-NEXT: str w10, [x8] +; CHECK-NEXT: .LBB4_8: // %pred.store.continue8 +; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1 +; CHECK-NEXT: mov x11, v1.d[1] +; CHECK-NEXT: cmp x11, #14 +; CHECK-NEXT: b.hi .LBB4_1 +; CHECK-NEXT: // %bb.9: // %pred.store.if9 +; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1 +; CHECK-NEXT: str w10, [x8, #4] +; CHECK-NEXT: b .LBB4_1 +; CHECK-NEXT: .LBB4_10: // %for.cond.cleanup +; CHECK-NEXT: ret +entry: + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %entry ], [ %index.next, %pred.store.continue18 ] + %vec.ind = phi <4 x i64> [ , %entry ], [ %vec.ind.next, %pred.store.continue18 ] + %0 = icmp ult <4 x i64> %vec.ind, + %1 = extractelement <4 x i1> %0, i64 0 + br i1 %1, label %pred.store.if, label %pred.store.continue + +pred.store.if: + %2 = getelementptr inbounds i32, ptr %dest, i64 %index + store i32 1, ptr %2, align 4 + br label %pred.store.continue + +pred.store.continue: + %3 = extractelement <4 x i1> %0, i64 1 + br i1 %3, label %pred.store.if5, label %pred.store.continue6 + +pred.store.if5: + %4 = or disjoint i64 %index, 1 + %5 = getelementptr inbounds i32, ptr %dest, i64 %4 + store i32 1, ptr %5, align 4 + br label %pred.store.continue6 + +pred.store.continue6: + %6 = extractelement <4 x i1> %0, i64 2 + br i1 %6, label %pred.store.if7, label %pred.store.continue8 + +pred.store.if7: + %7 = or disjoint i64 %index, 2 + %8 = getelementptr inbounds i32, ptr %dest, i64 %7 + store i32 1, ptr %8, align 4 + br label %pred.store.continue8 + +pred.store.continue8: + %9 = extractelement <4 x i1> %0, i64 3 + br i1 %9, label %pred.store.if9, label %pred.store.continue18 + +pred.store.if9: + %10 = or disjoint i64 %index, 3 + %11 = getelementptr inbounds i32, ptr %dest, i64 %10 + store i32 1, ptr %11, align 4 + br label %pred.store.continue18 + +pred.store.continue18: + %index.next = add i64 %index, 4 + %vec.ind.next = add <4 x i64> %vec.ind, + %24 = icmp eq i64 %index.next, 16 + br i1 %24, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: + ret void +} + + +; Negative tests + +define i1 @extract_icmp_v4i32_splat_rhs(<4 x i32> %a, i32 %b) { +; CHECK-LABEL: extract_icmp_v4i32_splat_rhs: +; CHECK: // %bb.0: +; CHECK-NEXT: dup v1.4s, w0 +; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: umov w8, v0.h[1] +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %ins = insertelement <4 x i32> poison, i32 %b, i32 0 + %splat = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> zeroinitializer + %icmp = icmp ult <4 x i32> %a, %splat + %ext = extractelement <4 x i1> %icmp, i32 1 + ret i1 %ext +} + +define i1 @extract_icmp_v4i32_splat_rhs_mul_use(<4 x i32> %a, ptr %p) { +; CHECK-LABEL: extract_icmp_v4i32_splat_rhs_mul_use: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.4s, #235 +; CHECK-NEXT: adrp x9, .LCPI6_0 +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI6_0] +; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s +; CHECK-NEXT: xtn v1.4h, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: umov w9, v1.h[1] +; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: and w0, w9, #0x1 
+; CHECK-NEXT: strb w10, [x8]
+; CHECK-NEXT: ret
+  %icmp = icmp ult <4 x i32> %a, splat(i32 235)
+  %ext = extractelement <4 x i1> %icmp, i32 1
+  store <4 x i1> %icmp, ptr %p, align 4
+  ret i1 %ext
+}
+
+define i1 @extract_icmp_v4i32_splat_rhs_unknown_idx(<4 x i32> %a, i32 %c) {
+; CHECK-LABEL: extract_icmp_v4i32_splat_rhs_unknown_idx:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: movi v1.4s, #127
+; CHECK-NEXT: add x8, sp, #8
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: bfi x8, x0, #1, #2
+; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: str d0, [sp, #8]
+; CHECK-NEXT: ldrh w8, [x8]
+; CHECK-NEXT: and w0, w8, #0x1
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+  %icmp = icmp ult <4 x i32> %a, splat(i32 127)
+  %ext = extractelement <4 x i1> %icmp, i32 %c
+  ret i1 %ext
+}
diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
index 6a4ab837fc472..531562d3aa678 100644
--- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
@@ -4,7 +4,6 @@
 
 ; CHECK-GI: warning: Instruction selection used fallback path for v16i4
 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i1
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i128
 
 declare <1 x i8> @llvm.sadd.sat.v1i8(<1 x i8>, <1 x i8>)
 declare <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8>, <2 x i8>)
@@ -498,21 +497,45 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 }
 
 define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
-; CHECK-LABEL: v2i128:
-; CHECK: // %bb.0:
-; CHECK-NEXT: adds x8, x0, x4
-; CHECK-NEXT: adcs x9, x1, x5
-; CHECK-NEXT: asr x10, x9, #63
-; CHECK-NEXT: eor x11, x10, #0x8000000000000000
-; CHECK-NEXT: csel x0, x10, x8, vs
-; CHECK-NEXT: csel x1, x11, x9, vs
-; CHECK-NEXT: adds x8, x2, x6
-; CHECK-NEXT: adcs x9, x3, x7
-; CHECK-NEXT: asr x10, x9, #63
-; CHECK-NEXT: eor x11, x10, #0x8000000000000000
-; CHECK-NEXT: csel x2, x10, x8, vs
-; CHECK-NEXT: csel x3, x11, x9, vs
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v2i128:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: adds x8, x0, x4
+; CHECK-SD-NEXT: adcs x9, x1, x5
+; CHECK-SD-NEXT: asr x10, x9, #63
+; CHECK-SD-NEXT: eor x11, x10, #0x8000000000000000
+; CHECK-SD-NEXT: csel x0, x10, x8, vs
+; CHECK-SD-NEXT: csel x1, x11, x9, vs
+; CHECK-SD-NEXT: adds x8, x2, x6
+; CHECK-SD-NEXT: adcs x9, x3, x7
+; CHECK-SD-NEXT: asr x10, x9, #63
+; CHECK-SD-NEXT: eor x11, x10, #0x8000000000000000
+; CHECK-SD-NEXT: csel x2, x10, x8, vs
+; CHECK-SD-NEXT: csel x3, x11, x9, vs
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v2i128:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adds x9, x0, x4
+; CHECK-GI-NEXT: mov w8, wzr
+; CHECK-GI-NEXT: mov x13, #-9223372036854775808 // =0x8000000000000000
+; CHECK-GI-NEXT: adcs x10, x1, x5
+; CHECK-GI-NEXT: asr x11, x10, #63
+; CHECK-GI-NEXT: cset w12, vs
+; CHECK-GI-NEXT: cmp w8, #1
+; CHECK-GI-NEXT: adc x14, x11, x13
+; CHECK-GI-NEXT: tst w12, #0x1
+; CHECK-GI-NEXT: csel x0, x11, x9, ne
+; CHECK-GI-NEXT: csel x1, x14, x10, ne
+; CHECK-GI-NEXT: adds x9, x2, x6
+; CHECK-GI-NEXT: adcs x10, x3, x7
+; CHECK-GI-NEXT: asr x11, x10, #63
+; CHECK-GI-NEXT: cset w12, vs
+; CHECK-GI-NEXT: cmp w8, #1
+; CHECK-GI-NEXT: adc x8, x11, x13
+; CHECK-GI-NEXT: tst w12, #0x1
+; CHECK-GI-NEXT: csel x2, x11, x9, ne
+; CHECK-GI-NEXT: csel x3, x8, x10, ne
+; CHECK-GI-NEXT: ret
   %z = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %x, <2 x i128> %y)
   ret <2 x i128> %z
 }
diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
index 86a503038766c..be4a5843e8215 100644
--- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
@@ -4,7 +4,6 @@
 
 ; CHECK-GI: warning: Instruction selection used fallback path for v16i4
 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i1
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i128
 
 declare <1 x i8> @llvm.ssub.sat.v1i8(<1 x i8>, <1 x i8>)
 declare <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8>, <2 x i8>)
@@ -501,21 +500,45 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 }
 
 define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
-; CHECK-LABEL: v2i128:
-; CHECK: // %bb.0:
-; CHECK-NEXT: subs x8, x0, x4
-; CHECK-NEXT: sbcs x9, x1, x5
-; CHECK-NEXT: asr x10, x9, #63
-; CHECK-NEXT: eor x11, x10, #0x8000000000000000
-; CHECK-NEXT: csel x0, x10, x8, vs
-; CHECK-NEXT: csel x1, x11, x9, vs
-; CHECK-NEXT: subs x8, x2, x6
-; CHECK-NEXT: sbcs x9, x3, x7
-; CHECK-NEXT: asr x10, x9, #63
-; CHECK-NEXT: eor x11, x10, #0x8000000000000000
-; CHECK-NEXT: csel x2, x10, x8, vs
-; CHECK-NEXT: csel x3, x11, x9, vs
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v2i128:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: subs x8, x0, x4
+; CHECK-SD-NEXT: sbcs x9, x1, x5
+; CHECK-SD-NEXT: asr x10, x9, #63
+; CHECK-SD-NEXT: eor x11, x10, #0x8000000000000000
+; CHECK-SD-NEXT: csel x0, x10, x8, vs
+; CHECK-SD-NEXT: csel x1, x11, x9, vs
+; CHECK-SD-NEXT: subs x8, x2, x6
+; CHECK-SD-NEXT: sbcs x9, x3, x7
+; CHECK-SD-NEXT: asr x10, x9, #63
+; CHECK-SD-NEXT: eor x11, x10, #0x8000000000000000
+; CHECK-SD-NEXT: csel x2, x10, x8, vs
+; CHECK-SD-NEXT: csel x3, x11, x9, vs
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v2i128:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: subs x9, x0, x4
+; CHECK-GI-NEXT: mov w8, wzr
+; CHECK-GI-NEXT: mov x13, #-9223372036854775808 // =0x8000000000000000
+; CHECK-GI-NEXT: sbcs x10, x1, x5
+; CHECK-GI-NEXT: asr x11, x10, #63
+; CHECK-GI-NEXT: cset w12, vs
+; CHECK-GI-NEXT: cmp w8, #1
+; CHECK-GI-NEXT: adc x14, x11, x13
+; CHECK-GI-NEXT: tst w12, #0x1
+; CHECK-GI-NEXT: csel x0, x11, x9, ne
+; CHECK-GI-NEXT: csel x1, x14, x10, ne
+; CHECK-GI-NEXT: subs x9, x2, x6
+; CHECK-GI-NEXT: sbcs x10, x3, x7
+; CHECK-GI-NEXT: asr x11, x10, #63
+; CHECK-GI-NEXT: cset w12, vs
+; CHECK-GI-NEXT: cmp w8, #1
+; CHECK-GI-NEXT: adc x8, x11, x13
+; CHECK-GI-NEXT: tst w12, #0x1
+; CHECK-GI-NEXT: csel x2, x11, x9, ne
+; CHECK-GI-NEXT: csel x3, x8, x10, ne
+; CHECK-GI-NEXT: ret
   %z = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %x, <2 x i128> %y)
   ret <2 x i128> %z
 }
diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
index d4587c3439967..924bd3981779e 100644
--- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
@@ -4,7 +4,6 @@
 
 ; CHECK-GI: warning: Instruction selection used fallback path for v16i4
 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i1
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i128
 
 declare <1 x i8> @llvm.uadd.sat.v1i8(<1 x i8>, <1 x i8>)
 declare <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8>, <2 x i8>)
@@ -492,17 +491,33 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 }
 
 define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
-; CHECK-LABEL: v2i128:
-; CHECK: // %bb.0:
-; CHECK-NEXT: adds x8, x0, x4
-; CHECK-NEXT: adcs x9, x1, x5
-; CHECK-NEXT: csinv x0, x8, xzr, lo
-; CHECK-NEXT: csinv x1, x9, xzr, lo
-; CHECK-NEXT: adds x8, x2, x6
-; CHECK-NEXT: adcs x9, x3, x7
-; CHECK-NEXT: csinv x2, x8, xzr, lo
-; CHECK-NEXT: csinv x3, x9, xzr, lo
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v2i128:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: adds x8, x0, x4
+; CHECK-SD-NEXT: adcs x9, x1, x5
+; CHECK-SD-NEXT: csinv x0, x8, xzr, lo
+; CHECK-SD-NEXT: csinv x1, x9, xzr, lo
+; CHECK-SD-NEXT: adds x8, x2, x6
+; CHECK-SD-NEXT: adcs x9, x3, x7
+; CHECK-SD-NEXT: csinv x2, x8, xzr, lo
+; CHECK-SD-NEXT: csinv x3, x9, xzr, lo
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v2i128:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adds x8, x0, x4
+; CHECK-GI-NEXT: adcs x9, x1, x5
+; CHECK-GI-NEXT: cset w10, hs
+; CHECK-GI-NEXT: tst w10, #0x1
+; CHECK-GI-NEXT: csinv x0, x8, xzr, eq
+; CHECK-GI-NEXT: csinv x1, x9, xzr, eq
+; CHECK-GI-NEXT: adds x8, x2, x6
+; CHECK-GI-NEXT: adcs x9, x3, x7
+; CHECK-GI-NEXT: cset w10, hs
+; CHECK-GI-NEXT: tst w10, #0x1
+; CHECK-GI-NEXT: csinv x2, x8, xzr, eq
+; CHECK-GI-NEXT: csinv x3, x9, xzr, eq
+; CHECK-GI-NEXT: ret
   %z = call <2 x i128> @llvm.uadd.sat.v2i128(<2 x i128> %x, <2 x i128> %y)
   ret <2 x i128> %z
 }
diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
index 123f4280bd8ff..a623eb554cac7 100644
--- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
@@ -4,7 +4,6 @@
 
 ; CHECK-GI: warning: Instruction selection used fallback path for v16i4
 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i1
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i128
 
 declare <1 x i8> @llvm.usub.sat.v1i8(<1 x i8>, <1 x i8>)
 declare <2 x i8> @llvm.usub.sat.v2i8(<2 x i8>, <2 x i8>)
@@ -490,17 +489,33 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 }
 
 define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
-; CHECK-LABEL: v2i128:
-; CHECK: // %bb.0:
-; CHECK-NEXT: subs x8, x0, x4
-; CHECK-NEXT: sbcs x9, x1, x5
-; CHECK-NEXT: csel x0, xzr, x8, lo
-; CHECK-NEXT: csel x1, xzr, x9, lo
-; CHECK-NEXT: subs x8, x2, x6
-; CHECK-NEXT: sbcs x9, x3, x7
-; CHECK-NEXT: csel x2, xzr, x8, lo
-; CHECK-NEXT: csel x3, xzr, x9, lo
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v2i128:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: subs x8, x0, x4
+; CHECK-SD-NEXT: sbcs x9, x1, x5
+; CHECK-SD-NEXT: csel x0, xzr, x8, lo
+; CHECK-SD-NEXT: csel x1, xzr, x9, lo
+; CHECK-SD-NEXT: subs x8, x2, x6
+; CHECK-SD-NEXT: sbcs x9, x3, x7
+; CHECK-SD-NEXT: csel x2, xzr, x8, lo
+; CHECK-SD-NEXT: csel x3, xzr, x9, lo
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v2i128:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: subs x8, x0, x4
+; CHECK-GI-NEXT: sbcs x9, x1, x5
+; CHECK-GI-NEXT: cset w10, lo
+; CHECK-GI-NEXT: tst w10, #0x1
+; CHECK-GI-NEXT: csel x0, xzr, x8, ne
+; CHECK-GI-NEXT: csel x1, xzr, x9, ne
+; CHECK-GI-NEXT: subs x8, x2, x6
+; CHECK-GI-NEXT: sbcs x9, x3, x7
+; CHECK-GI-NEXT: cset w10, lo
+; CHECK-GI-NEXT: tst w10, #0x1
+; CHECK-GI-NEXT: csel x2, xzr, x8, ne
+; CHECK-GI-NEXT: csel x3, xzr, x9, ne
+; CHECK-GI-NEXT: ret
   %z = call <2 x i128> @llvm.usub.sat.v2i128(<2 x i128> %x, <2 x i128> %y)
   ret <2 x i128> %z
 }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
index 824d3708c027d..33dd2bd540ad0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -4,29 +4,15 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,W32 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
 
-; RUN: opt -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
-
 ; GCN-LABEL: {{^}}fold_wavefrontsize:
-; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
 
 ; W32: v_mov_b32_e32 [[V:v[0-9]+]], 32
 ; W64: v_mov_b32_e32 [[V:v[0-9]+]], 64
 ; GCN: store_{{dword|b32}} v{{.+}}, [[V]]
 
-; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
-; OPT: store i32 %tmp, ptr addrspace(1) %arg, align 4
-; OPT-NEXT: ret void
 define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+
 bb:
   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
   store i32 %tmp, ptr addrspace(1) %arg, align 4
@@ -34,18 +20,12 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}fold_and_optimize_wavefrontsize:
-; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
 
 ; W32: v_mov_b32_e32 [[V:v[0-9]+]], 1{{$}}
 ; W64: v_mov_b32_e32 [[V:v[0-9]+]], 2{{$}}
 ; GCN-NOT: cndmask
 ; GCN: store_{{dword|b32}} v{{.+}}, [[V]]
 
-; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
-; OPT: %tmp1 = icmp ugt i32 %tmp, 32
-; OPT: %tmp2 = select i1 %tmp1, i32 2, i32 1
-; OPT: store i32 %tmp2, ptr addrspace(1) %arg
-; OPT-NEXT: ret void
 define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) {
 bb:
@@ -57,13 +37,6 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}fold_and_optimize_if_wavefrontsize:
-; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
-
-; OPT: bb:
-; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
-; OPT: %tmp1 = icmp ugt i32 %tmp, 32
-; OPT: bb3:
-; OPT-NEXT: ret void
 define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) {
 bb:
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/double-arith.ll b/llvm/test/CodeGen/RISCV/GlobalISel/double-arith.ll
index eafc9c644bdbf..2f7c93eb1c0de 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/double-arith.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/double-arith.ll
@@ -186,8 +186,7 @@ define double @fsgnjn_d(double %a, double %b) nounwind {
 ;
 ; CHECKIFD-LABEL: fsgnjn_d:
 ; CHECKIFD: # %bb.0:
-; CHECKIFD-NEXT: fneg.d fa5, fa1
-; CHECKIFD-NEXT: fsgnj.d fa0, fa0, fa5
+; CHECKIFD-NEXT: fsgnjn.d fa0, fa0, fa1
 ; CHECKIFD-NEXT: ret
 ;
 ; RV32I-LABEL: fsgnjn_d:
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/float-arith.ll b/llvm/test/CodeGen/RISCV/GlobalISel/float-arith.ll
index 0d210890c41f9..7fe4d2ef797af 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/float-arith.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/float-arith.ll
@@ -183,8 +183,7 @@ define float @fsgnjn_s(float %a, float %b) nounwind {
 ; CHECKIF-LABEL: fsgnjn_s:
 ; CHECKIF: # %bb.0:
 ; CHECKIF-NEXT: fadd.s fa5, fa0, fa1
-; CHECKIF-NEXT: fneg.s fa5, fa5
-; CHECKIF-NEXT: fsgnj.s fa0, fa0, fa5
+; CHECKIF-NEXT: fsgnjn.s fa0, fa0, fa5
 ; CHECKIF-NEXT: ret
 ;
 ; RV32I-LABEL: fsgnjn_s:
diff --git a/llvm/test/MachineVerifier/RISCV/subreg-liveness.mir b/llvm/test/MachineVerifier/RISCV/subreg-liveness.mir
new file mode 100644
index 0000000000000..c69bc1b5eca64
--- /dev/null
+++ b/llvm/test/MachineVerifier/RISCV/subreg-liveness.mir
@@ -0,0 +1,27 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=none %s -o - | FileCheck %s
+# REQUIRES: riscv64-registered-target
+
+# The MachineVerifier assumes that used registers have been defined.
+# In this test case, while $v12_v13_v14_v15_v16 covers $v14_v15, $v14_v15 is
+# not a sub-register of $v14m2 even though they share the same register.
+# This corner case can be resolved by checking the register using RegUnit.
+
+...
+---
+name: func
+tracksRegLiveness: true
+tracksDebugUserValues: true
+body: |
+  bb.0:
+    liveins: $v0, $v8, $v9, $v10, $v11
+
+    ; CHECK-LABEL: name: func
+    ; CHECK: liveins: $v0, $v8, $v9, $v10, $v11
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: renamable $v16m2 = PseudoVMV_V_I_M2 undef renamable $v16m2, 0, -1, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: $v20m2 = VMV2R_V $v14m2, implicit $v12_v13_v14_v15_v16
+    renamable $v16m2 = PseudoVMV_V_I_M2 undef renamable $v16m2, 0, -1, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+    $v20m2 = VMV2R_V $v14m2, implicit $v12_v13_v14_v15_v16
+
+...
diff --git a/llvm/test/Transforms/Inline/LoongArch/inline-target-features.ll b/llvm/test/Transforms/Inline/LoongArch/inline-target-features.ll
new file mode 100644
index 0000000000000..f7a37015e07fc
--- /dev/null
+++ b/llvm/test/Transforms/Inline/LoongArch/inline-target-features.ll
@@ -0,0 +1,34 @@
+; RUN: opt < %s -mtriple=loongarch64-unknown-linux-gnu -S -passes=inline | FileCheck %s
+; RUN: opt < %s -mtriple=loongarch64-unknown-linux-gnu -S -passes='cgscc(inline)' | FileCheck %s
+; Check that we only inline when we have compatible target attributes.
+
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "loongarch64-unknown-linux-gnu"
+
+define i32 @foo() #0 {
+entry:
+  %call = call i32 (...) @baz()
+  ret i32 %call
+; CHECK-LABEL: foo
+; CHECK: call i32 (...) @baz()
+}
+declare i32 @baz(...) #0
+
+define i32 @bar() #1 {
+entry:
+  %call = call i32 @foo()
+  ret i32 %call
+; CHECK-LABEL: bar
+; CHECK: call i32 (...) @baz()
+}
+
+define i32 @qux() #0 {
+entry:
+  %call = call i32 @bar()
+  ret i32 %call
+; CHECK-LABEL: qux
+; CHECK: call i32 @bar()
+}
+
+attributes #0 = { "target-cpu"="generic-la64" "target-features"="+f,+d" }
+attributes #1 = { "target-cpu"="generic-la64" "target-features"="+f,+d,+lsx,+lasx" }
diff --git a/llvm/test/Transforms/Inline/LoongArch/lit.local.cfg b/llvm/test/Transforms/Inline/LoongArch/lit.local.cfg
new file mode 100644
index 0000000000000..cc24278acbb41
--- /dev/null
+++ b/llvm/test/Transforms/Inline/LoongArch/lit.local.cfg
@@ -0,0 +1,2 @@
+if not "LoongArch" in config.root.targets:
+    config.unsupported = True
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll
new file mode 100644
index 0000000000000..d9c105f753e26
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-- -passes=instcombine -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
+
+define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) {
+; OPT-NEXT: [[BB:.*:]]
+; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1:[0-9]+]]
+; OPT-NEXT: store i32 [[TMP]], ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT: ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; OPT-W32-NEXT: [[BB:.*:]]
+; OPT-W32-NEXT: store i32 32, ptr addrspace(1) [[ARG]], align 4
+; OPT-W32-NEXT: ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; OPT-W64-NEXT: [[BB:.*:]]
+; OPT-W64-NEXT: store i32 64, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT: ret void
+;
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
+  store i32 %tmp, ptr addrspace(1) %arg, align 4
+  ret void
+}
+
+define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) {
+; OPT-NEXT: [[BB:.*:]]
+; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1]]
+; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
+; OPT-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 2, i32 1
+; OPT-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT: ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; OPT-W32-NEXT: [[BB:.*:]]
+; OPT-W32-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-W32-NEXT: ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; OPT-W64-NEXT: [[BB:.*:]]
+; OPT-W64-NEXT: store i32 2, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT: ret void
+;
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
+  %tmp1 = icmp ugt i32 %tmp, 32
+  %tmp2 = select i1 %tmp1, i32 2, i32 1
+  store i32 %tmp2, ptr addrspace(1) %arg
+  ret void
+}
+
+define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) {
+; OPT-NEXT: [[BB:.*:]]
+; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1]]
+; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
+; OPT-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
+; OPT: [[BB2]]:
+; OPT-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT: br label %[[BB3]]
+; OPT: [[BB3]]:
+; OPT-NEXT: ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; OPT-W32-NEXT: [[BB:.*:]]
+; OPT-W32-NEXT: br i1 false, label %[[BB2:.*]], label %[[BB3:.*]]
+; OPT-W32: [[BB2]]:
+; OPT-W32-NEXT: br label %[[BB3]]
+; OPT-W32: [[BB3]]:
+; OPT-W32-NEXT: ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; OPT-W64-NEXT: [[BB:.*:]]
+; OPT-W64-NEXT: br i1 true, label %[[BB2:.*]], label %[[BB3:.*]]
+; OPT-W64: [[BB2]]:
+; OPT-W64-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT: br label %[[BB3]]
+; OPT-W64: [[BB3]]:
+; OPT-W64-NEXT: ret void
+;
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
+  %tmp1 = icmp ugt i32 %tmp, 32
+  br i1 %tmp1, label %bb2, label %bb3
+
+bb2: ; preds = %bb
+  store i32 1, ptr addrspace(1) %arg, align 4
+  br label %bb3
+
+bb3: ; preds = %bb2, %bb
+  ret void
+}
+
+declare i32 @llvm.amdgcn.wavefrontsize() #0
+
+attributes #0 = { nounwind readnone speculatable }
diff --git a/llvm/test/Transforms/InstCombine/and-fcmp.ll b/llvm/test/Transforms/InstCombine/and-fcmp.ll
index 30b9fca6e97ad..c7bbc8ab56f9a 100644
--- a/llvm/test/Transforms/InstCombine/and-fcmp.ll
+++ b/llvm/test/Transforms/InstCombine/and-fcmp.ll
@@ -5044,11 +5044,9 @@ define i1 @isnormal_logical_select_0_fmf1(half %x) {
 
 define i1 @and_fcmp_reassoc1(i1 %x, double %a, double %b) {
 ; CHECK-LABEL: @and_fcmp_reassoc1(
-; CHECK-NEXT: [[TMP1:%.*]] = fcmp ult double [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[CMP1:%.*]] = fcmp ugt double [[A]], [[B]]
+; CHECK-NEXT: [[TMP1:%.*]] = fcmp uno double [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT: [[RETVAL:%.*]] = and i1 [[TMP1]], [[X:%.*]]
-; CHECK-NEXT: [[RETVAL1:%.*]] = and i1 [[RETVAL]], [[CMP1]]
-; CHECK-NEXT: ret i1 [[RETVAL1]]
+; CHECK-NEXT: ret i1 [[RETVAL]]
 ;
   %cmp = fcmp ult double %a, %b
   %cmp1 = fcmp ugt double %a, %b
@@ -5059,11 +5057,9 @@ define i1 @and_fcmp_reassoc1(i1 %x, double %a, double %b) {
 
 define i1 @and_fcmp_reassoc2(i1 %x, double %a, double %b) {
 ; CHECK-LABEL: @and_fcmp_reassoc2(
-; CHECK-NEXT: [[TMP1:%.*]] = fcmp ult double [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[CMP1:%.*]] = fcmp ugt double [[A]], [[B]]
+; CHECK-NEXT: [[TMP1:%.*]] = fcmp uno double [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT: [[RETVAL:%.*]] = and i1 [[X:%.*]], [[TMP1]]
-; CHECK-NEXT: [[RETVAL1:%.*]] = and i1 [[RETVAL]], [[CMP1]]
-; CHECK-NEXT: ret i1 [[RETVAL1]]
+; CHECK-NEXT: ret i1 [[RETVAL]]
 ;
   %cmp = fcmp ult double %a, %b
   %cmp1 = fcmp ugt double %a, %b
@@ -5074,11 +5070,9 @@ define i1 @and_fcmp_reassoc2(i1 %x, double %a, double %b) {
 
 define i1 @and_fcmp_reassoc3(i1 %x, double %a, double %b) {
 ; CHECK-LABEL: @and_fcmp_reassoc3(
-; CHECK-NEXT: [[TMP1:%.*]] = fcmp ult double [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[CMP1:%.*]] = fcmp ugt double [[A]], [[B]]
+; CHECK-NEXT: [[TMP1:%.*]] = fcmp uno double [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT: [[RETVAL:%.*]] = and i1 [[TMP1]], [[X:%.*]]
-; CHECK-NEXT: [[RETVAL1:%.*]] = and i1 [[CMP1]], [[RETVAL]]
-; CHECK-NEXT: ret i1 [[RETVAL1]]
+; CHECK-NEXT: ret i1 [[RETVAL]]
 ;
   %cmp = fcmp ult double %a, %b
   %cmp1 = fcmp ugt double %a, %b
@@ -5089,11 +5083,9 @@ define i1 @and_fcmp_reassoc3(i1 %x, double %a, double %b) {
 
 define i1 @and_fcmp_reassoc4(i1 %x, double %a, double %b) {
 ; CHECK-LABEL: @and_fcmp_reassoc4(
-; CHECK-NEXT: [[TMP1:%.*]] = fcmp ult double [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[CMP1:%.*]] = fcmp ugt double [[A]], [[B]]
+; CHECK-NEXT: [[TMP1:%.*]] = fcmp uno double [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT: [[RETVAL:%.*]] = and i1 [[X:%.*]], [[TMP1]]
-; CHECK-NEXT: [[RETVAL1:%.*]] = and i1 [[CMP1]], [[RETVAL]]
-; CHECK-NEXT: ret i1 [[RETVAL1]]
+; CHECK-NEXT: ret i1 [[RETVAL]]
 ;
   %cmp = fcmp ult double %a, %b
   %cmp1 = fcmp ugt double %a, %b
diff --git a/llvm/test/Transforms/InstCombine/eq-of-parts.ll b/llvm/test/Transforms/InstCombine/eq-of-parts.ll
index 00ee7bf643286..d07c2e6a5be52 100644
--- a/llvm/test/Transforms/InstCombine/eq-of-parts.ll
+++ b/llvm/test/Transforms/InstCombine/eq-of-parts.ll
@@ -1441,11 +1441,7 @@ define i1 @ne_optimized_highbits_cmp_todo_overlapping(i32 %x, i32 %y) {
 
 define i1 @and_trunc_i1(i8 %a1, i8 %a2) {
 ; CHECK-LABEL: @and_trunc_i1(
-; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[A1:%.*]], [[A2:%.*]]
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[XOR]], 2
-; CHECK-NEXT: [[LOBIT:%.*]] = trunc i8 [[XOR]] to i1
-; CHECK-NEXT: [[LOBIT_INV:%.*]] = xor i1 [[LOBIT]], true
-; CHECK-NEXT: [[AND:%.*]] = and i1 [[CMP]], [[LOBIT_INV]]
+; CHECK-NEXT: [[AND:%.*]] = icmp eq i8 [[A1:%.*]], [[A2:%.*]]
 ; CHECK-NEXT: ret i1 [[AND]]
 ;
   %xor = xor i8 %a1, %a2
@@ -1494,10 +1490,7 @@ define i1 @and_trunc_i1_wrong_operands(i8 %a1, i8 %a2, i8 %a3) {
 
 define i1 @or_trunc_i1(i64 %a1, i64 %a2) {
 ; CHECK-LABEL: @or_trunc_i1(
-; CHECK-NEXT: [[XOR:%.*]] = xor i64 [[A2:%.*]], [[A1:%.*]]
-; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[XOR]], 1
-; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[XOR]] to i1
-; CHECK-NEXT: [[OR:%.*]] = or i1 [[CMP]], [[TRUNC]]
+; CHECK-NEXT: [[OR:%.*]] = icmp ne i64 [[A2:%.*]], [[A1:%.*]]
 ; CHECK-NEXT: ret i1 [[OR]]
 ;
   %xor = xor i64 %a2, %a1
@@ -1538,3 +1531,28 @@ define i1 @or_trunc_i1_wrong_operands(i64 %a1, i64 %a2, i64 %a3) {
   %or = or i1 %cmp, %trunc
   ret i1 %or
 }
+
+define i1 @jv_identical(i64 %arg1, i64 %arg2) {
+; CHECK-LABEL: @jv_identical(
+; CHECK-NEXT: [[ARG1_TRUNC:%.*]] = trunc i64 [[ARG1:%.*]] to i8
+; CHECK-NEXT: [[ARG2_TRUNC:%.*]] = trunc i64 [[ARG2:%.*]] to i8
+; CHECK-NEXT: [[EQ1:%.*]] = icmp eq i8 [[ARG1_TRUNC]], [[ARG2_TRUNC]]
+; CHECK-NEXT: [[DOTUNSHIFTED:%.*]] = xor i64 [[ARG2]], [[ARG1]]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i64 [[DOTUNSHIFTED]], 65536
+; CHECK-NEXT: [[AND2:%.*]] = and i1 [[EQ1]], [[TMP1]]
+; CHECK-NEXT: ret i1 [[AND2]]
+;
+  %arg1.trunc = trunc i64 %arg1 to i8
+  %arg1.shift = lshr i64 %arg1, 16
+  %arg1.shift.trunc = trunc i64 %arg1.shift to i16
+  %arg2.trunc = trunc i64 %arg2 to i8
+  %arg2.shift = lshr i64 %arg2, 16
+  %arg2.shift.trunc = trunc i64 %arg2.shift to i16
+  %eq1 = icmp eq i8 %arg1.trunc, %arg2.trunc
+  %eq2 = icmp eq i16 %arg1.shift.trunc, %arg2.shift.trunc
+  %and1 = and i1 %eq1, %eq2
+  %xor = xor i64 %arg2, %arg1
+  %cmp = icmp ult i64 %xor, 4294967296
+  %and2 = and i1 %cmp, %and1
+  ret i1 %and2
+}
diff --git a/llvm/test/Transforms/InstCombine/fptrunc.ll b/llvm/test/Transforms/InstCombine/fptrunc.ll
index f46940ff060d4..a4296a326c4bc 100644
--- a/llvm/test/Transforms/InstCombine/fptrunc.ll
+++ b/llvm/test/Transforms/InstCombine/fptrunc.ll
@@ -90,6 +90,19 @@ define half @fptrunc_select_true_val_extra_use(half %x, float %y, i1 %cond) {
   ret half %r
 }
 
+define half @fptrunc_max(half %arg) {
+; CHECK-LABEL: @fptrunc_max(
+; CHECK-NEXT: [[CMP:%.*]] = fcmp olt half [[ARG:%.*]], 0xH0000
+; CHECK-NEXT: [[NARROW_SEL:%.*]] = select i1 [[CMP]], half 0xH0000, half [[ARG]]
+; CHECK-NEXT: ret half [[NARROW_SEL]]
+;
+  %ext = fpext half %arg to double
+  %cmp = fcmp olt double %ext, 0.000000e+00
+  %max = select i1 %cmp, double 0.000000e+00, double %ext
+  %trunc = fptrunc double %max to half
+  ret half %trunc
+}
+
 ; Negative test - this would require an extra instruction.
 
 define half @fptrunc_select_true_val_extra_use_2(half %x, float %y, i1 %cond) {
diff --git a/llvm/test/Transforms/InstCombine/or-fcmp.ll b/llvm/test/Transforms/InstCombine/or-fcmp.ll
index a2842f7a45f59..193fe4b5cc722 100644
--- a/llvm/test/Transforms/InstCombine/or-fcmp.ll
+++ b/llvm/test/Transforms/InstCombine/or-fcmp.ll
@@ -54,7 +54,7 @@ define i1 @PR41069(double %a, double %b, double %c, double %d) {
 ; CHECK-LABEL: @PR41069(
 ; CHECK-NEXT: [[UNO1:%.*]] = fcmp uno double [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT: [[TMP1:%.*]] = fcmp uno double [[D:%.*]], [[C:%.*]]
-; CHECK-NEXT: [[R:%.*]] = or i1 [[TMP1]], [[UNO1]]
+; CHECK-NEXT: [[R:%.*]] = or i1 [[UNO1]], [[TMP1]]
 ; CHECK-NEXT: ret i1 [[R]]
 ;
   %uno1 = fcmp uno double %a, %b
@@ -87,7 +87,7 @@ define i1 @PR41069_commute(double %a, double %b, double %c, double %d) {
 ; CHECK-LABEL: @PR41069_commute(
 ; CHECK-NEXT: [[UNO1:%.*]] = fcmp uno double [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT: [[TMP1:%.*]] = fcmp uno double [[D:%.*]], [[C:%.*]]
-; CHECK-NEXT: [[R:%.*]] = or i1 [[TMP1]], [[UNO1]]
+; CHECK-NEXT: [[R:%.*]] = or i1 [[UNO1]], [[TMP1]]
 ; CHECK-NEXT: ret i1 [[R]]
 ;
   %uno1 = fcmp uno double %a, %b
@@ -4608,11 +4608,9 @@ define i1 @intersect_fmf_4(double %a, double %b) {
 
 define i1 @or_fcmp_reassoc1(i1 %x, double %a, double %b) {
 ; CHECK-LABEL: @or_fcmp_reassoc1(
-; CHECK-NEXT: [[OR:%.*]] = fcmp olt double [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt double [[A]], [[B]]
+; CHECK-NEXT: [[OR:%.*]] = fcmp one double [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT: [[RETVAL:%.*]] = or i1 [[OR]], [[CMP1:%.*]]
-; CHECK-NEXT: [[RETVAL1:%.*]] = or i1 [[RETVAL]], [[CMP2]]
-; CHECK-NEXT: ret i1 [[RETVAL1]]
+; CHECK-NEXT: ret i1 [[RETVAL]]
 ;
   %cmp = fcmp olt double %a, %b
   %cmp1 = fcmp ogt double %a, %b
@@ -4623,11 +4621,9 @@ define i1 @or_fcmp_reassoc1(i1 %x, double %a, double %b) {
 
 define i1 @or_fcmp_reassoc2(i1 %x, double %a, double %b) {
 ; CHECK-LABEL: @or_fcmp_reassoc2(
-; CHECK-NEXT: [[TMP1:%.*]] = fcmp olt double [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[CMP1:%.*]] = fcmp ogt double [[A]], [[B]]
+; CHECK-NEXT: [[TMP1:%.*]] = fcmp one double [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT: [[RETVAL:%.*]] = or i1 [[X:%.*]], [[TMP1]]
-; CHECK-NEXT: [[RETVAL1:%.*]] = or i1 [[RETVAL]], [[CMP1]]
-; CHECK-NEXT: ret i1 [[RETVAL1]]
+; CHECK-NEXT: ret i1 [[RETVAL]]
 ;
   %cmp = fcmp olt double %a, %b
   %cmp1 = fcmp ogt double %a, %b
@@ -4638,11 +4634,9 @@ define i1 @or_fcmp_reassoc2(i1 %x, double %a, double %b) {
 
 define i1 @or_fcmp_reassoc3(i1 %x, double %a, double %b) {
 ; CHECK-LABEL: @or_fcmp_reassoc3(
-; CHECK-NEXT: [[TMP1:%.*]] = fcmp olt double [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[CMP1:%.*]] = fcmp ogt double [[A]], [[B]]
+; CHECK-NEXT: [[TMP1:%.*]] = fcmp one double [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT: [[RETVAL:%.*]] = or i1 [[TMP1]], [[X:%.*]]
-; CHECK-NEXT: [[RETVAL1:%.*]] = or i1 [[CMP1]], [[RETVAL]]
-; CHECK-NEXT: ret i1 [[RETVAL1]]
+; CHECK-NEXT: ret i1 [[RETVAL]]
 ;
   %cmp = fcmp olt double %a, %b
   %cmp1 = fcmp ogt double %a, %b
@@ -4653,11 +4647,9 @@ define i1 @or_fcmp_reassoc3(i1 %x, double %a, double %b) {
 
 define i1 @or_fcmp_reassoc4(i1 %x, double %a, double %b) {
 ; CHECK-LABEL: @or_fcmp_reassoc4(
-; CHECK-NEXT: [[TMP1:%.*]] = fcmp olt double [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[CMP1:%.*]] = fcmp ogt double [[A]], [[B]]
+; CHECK-NEXT: [[TMP1:%.*]] = fcmp one double [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT: [[RETVAL:%.*]] = or i1 [[X:%.*]], [[TMP1]]
-; CHECK-NEXT: [[RETVAL1:%.*]] = or i1 [[CMP1]], [[RETVAL]]
-; CHECK-NEXT: ret i1 [[RETVAL1]]
+; CHECK-NEXT: ret i1 [[RETVAL]]
 ;
   %cmp = fcmp olt double %a, %b
   %cmp1 = fcmp ogt double %a, %b
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll b/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll
index 405afd5969a41..5c9058b482320 100644
--- a/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll
+++ b/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll
@@ -276,21 +276,19 @@ if.false: ; preds = %if.true, %entry
 }
 
 ;; Both of successor 0 and successor 1 have a single predecessor.
-;; TODO: Support transform for this case.
-define void @single_predecessor(ptr %p, ptr %q, i32 %a) {
+define i32 @single_predecessor(ptr %p, ptr %q, i32 %a) {
 ; CHECK-LABEL: @single_predecessor(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0
-; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
-; CHECK: common.ret:
-; CHECK-NEXT: ret void
-; CHECK: if.end:
-; CHECK-NEXT: store i32 1, ptr [[Q:%.*]], align 4
-; CHECK-NEXT: br label [[COMMON_RET:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[Q]], align 4
-; CHECK-NEXT: store i32 [[TMP0]], ptr [[P:%.*]], align 4
-; CHECK-NEXT: br label [[COMMON_RET]]
+; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[TOBOOL]], true
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i1 [[TOBOOL]] to <1 x i1>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 1), ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP2]])
+; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q]], i32 4, <1 x i1> [[TMP1]], <1 x i32> poison)
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP1]])
+; CHECK-NEXT: [[DOT:%.*]] = select i1 [[TOBOOL]], i32 2, i32 3
+; CHECK-NEXT: ret i32 [[DOT]]
 ;
 entry:
   %tobool = icmp ne i32 %a, 0
@@ -298,12 +296,12 @@ entry:
 
 if.end:
   store i32 1, ptr %q
-  ret void
+  ret i32 2
 
 if.then:
   %0 = load i32, ptr %q
   store i32 %0, ptr %p
-  ret void
+  ret i32 3
 }
 
 ;; Hoist 6 stores.
@@ -759,6 +757,44 @@ if.true:
   ret i32 %res
 }
 
+;; Do not transform if either BB has multiple successors.
+define i32 @not_multi_successors(i1 %c1, i32 %c2, ptr %p) {
+; CHECK-LABEL: @not_multi_successors(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C1:%.*]], label [[ENTRY_IF:%.*]], label [[COMMON_RET:%.*]]
+; CHECK: entry.if:
+; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[P:%.*]], align 4
+; CHECK-NEXT: switch i32 [[C2:%.*]], label [[COMMON_RET]] [
+; CHECK-NEXT: i32 0, label [[SW_BB:%.*]]
+; CHECK-NEXT: i32 1, label [[SW_BB]]
+; CHECK-NEXT: ]
+; CHECK: common.ret:
+; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VAL]], [[ENTRY_IF]] ], [ 0, [[SW_BB]] ]
+; CHECK-NEXT: ret i32 [[COMMON_RET_OP]]
+; CHECK: sw.bb:
+; CHECK-NEXT: br label [[COMMON_RET]]
+;
+entry:
+  br i1 %c1, label %entry.if, label %entry.else
+
+entry.if: ; preds = %entry
+  %val = load i32, ptr %p, align 4
+  switch i32 %c2, label %return [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb
+  ]
+
+entry.else: ; preds = %entry
+  ret i32 0
+
+sw.bb: ; preds = %entry.if, %entry.if
+  br label %return
+
+return: ; preds = %sw.bb, %entry.if
+  %ret = phi i32 [ %val, %entry.if ], [ 0, %sw.bb ]
+  ret i32 %ret
+}
+
 declare i32 @read_memory_only() readonly nounwind willreturn speculatable
 
 !llvm.dbg.cu = !{!0}
diff --git a/llvm/unittests/ProfileData/MemProfTest.cpp b/llvm/unittests/ProfileData/MemProfTest.cpp
index c3f35e41b5fc7..7b9910e295df9 100644
--- a/llvm/unittests/ProfileData/MemProfTest.cpp
+++ b/llvm/unittests/ProfileData/MemProfTest.cpp
@@ -628,7 +628,7 @@ TEST(MemProf, RadixTreeBuilderEmpty) {
   FrameHistogram = llvm::memprof::computeFrameHistogram(MemProfCallStackData);
   llvm::memprof::CallStackRadixTreeBuilder Builder;
-  Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
+  Builder.build(std::move(MemProfCallStackData), &MemProfFrameIndexes,
                 FrameHistogram);
   ASSERT_THAT(Builder.getRadixArray(), testing::IsEmpty());
   const auto Mappings = Builder.takeCallStackPos();
@@ -646,7 +646,7 @@ TEST(MemProf, RadixTreeBuilderOne) {
   FrameHistogram = llvm::memprof::computeFrameHistogram(MemProfCallStackData);
   llvm::memprof::CallStackRadixTreeBuilder Builder;
-  Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
+  Builder.build(std::move(MemProfCallStackData), &MemProfFrameIndexes,
                 FrameHistogram);
   EXPECT_THAT(Builder.getRadixArray(), testing::ElementsAreArray({
                   3U, // Size of CS1,
@@ -673,7 +673,7 @@ TEST(MemProf, RadixTreeBuilderTwo) {
   FrameHistogram = llvm::memprof::computeFrameHistogram(MemProfCallStackData);
   llvm::memprof::CallStackRadixTreeBuilder Builder;
-  Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
+  Builder.build(std::move(MemProfCallStackData), &MemProfFrameIndexes,
                 FrameHistogram);
   EXPECT_THAT(Builder.getRadixArray(),
               testing::ElementsAreArray({
@@ -711,7 +711,7 @@ TEST(MemProf, RadixTreeBuilderSuccessiveJumps) {
   FrameHistogram = llvm::memprof::computeFrameHistogram(MemProfCallStackData);
   llvm::memprof::CallStackRadixTreeBuilder Builder;
-  Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
+  Builder.build(std::move(MemProfCallStackData), &MemProfFrameIndexes,
                 FrameHistogram);
   EXPECT_THAT(Builder.getRadixArray(),
              testing::ElementsAreArray({
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h
index 2ea589a7c4c3b..8b380751c2f9d 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h
@@ -73,145 +73,6 @@ DEFINE_TRIVIAL_LLVM_TYPE(LLVMMetadataType, "llvm.metadata");
 
 #undef DEFINE_TRIVIAL_LLVM_TYPE
 
-//===----------------------------------------------------------------------===//
-// LLVMStructType.
-//===----------------------------------------------------------------------===//
-
-/// LLVM dialect structure type representing a collection of different-typed
-/// elements manipulated together. Structured can optionally be packed, meaning
-/// that their elements immediately follow each other in memory without
-/// accounting for potential alignment.
-///
-/// Structure types can be identified (named) or literal. Literal structures
-/// are uniquely represented by the list of types they contain and packedness.
-/// Literal structure types are immutable after construction.
-///
-/// Identified structures are uniquely represented by their name, a string. They
-/// have a mutable component, consisting of the list of types they contain,
-/// the packedness and the opacity bits. Identified structs can be created
-/// without providing the lists of element types, making them suitable to
-/// represent recursive, i.e. self-referring, structures. Identified structs
-/// without body are considered opaque. For such structs, one can set the body.
-/// Identified structs can be created as intentionally-opaque, implying that the
-/// caller does not intend to ever set the body (e.g. forward-declarations of
-/// structs from another module) and wants to disallow further modification of
-/// the body. For intentionally-opaque structs or non-opaque structs with the
-/// body, one is not allowed to set another body (however, one can set exactly
-/// the same body).
-///
-/// Note that the packedness of the struct takes place in uniquing of literal
-/// structs, but does not in uniquing of identified structs.
-class LLVMStructType
-    : public Type::TypeBase<LLVMStructType, Type, detail::LLVMStructTypeStorage,
-                            DataLayoutTypeInterface::Trait,
-                            DestructurableTypeInterface::Trait,
-                            TypeTrait::IsMutable> {
-public:
-  /// Inherit base constructors.
-  using Base::Base;
-
-  static constexpr StringLiteral name = "llvm.struct";
-
-  /// Checks if the given type can be contained in a structure type.
-  static bool isValidElementType(Type type);
-
-  /// Gets or creates an identified struct with the given name in the provided
-  /// context. Note that unlike llvm::StructType::create, this function will
-  /// _NOT_ rename a struct in case a struct with the same name already exists
-  /// in the context. Instead, it will just return the existing struct,
-  /// similarly to the rest of MLIR type ::get methods.
-  static LLVMStructType getIdentified(MLIRContext *context, StringRef name);
-  static LLVMStructType
-  getIdentifiedChecked(function_ref<InFlightDiagnostic()> emitError,
-                       MLIRContext *context, StringRef name);
-
-  /// Gets a new identified struct with the given body. The body _cannot_ be
-  /// changed later. If a struct with the given name already exists, renames
-  /// the struct by appending a `.` followed by a number to the name. Renaming
-  /// happens even if the existing struct has the same body.
-  static LLVMStructType getNewIdentified(MLIRContext *context, StringRef name,
-                                         ArrayRef<Type> elements,
-                                         bool isPacked = false);
-
-  /// Gets or creates a literal struct with the given body in the provided
-  /// context.
-  static LLVMStructType getLiteral(MLIRContext *context, ArrayRef<Type> types,
-                                   bool isPacked = false);
-  static LLVMStructType
-  getLiteralChecked(function_ref<InFlightDiagnostic()> emitError,
-                    MLIRContext *context, ArrayRef<Type> types,
-                    bool isPacked = false);
-
-  /// Gets or creates an intentionally-opaque identified struct. Such a struct
-  /// cannot have its body set. To create an opaque struct with a mutable body,
-  /// use `getIdentified`. Note that unlike llvm::StructType::create, this
-  /// function will _NOT_ rename a struct in case a struct with the same name
-  /// already exists in the context. Instead, it will just return the existing
-  /// struct, similarly to the rest of MLIR type ::get methods.
-  static LLVMStructType getOpaque(StringRef name, MLIRContext *context);
-  static LLVMStructType
-  getOpaqueChecked(function_ref<InFlightDiagnostic()> emitError,
-                   MLIRContext *context, StringRef name);
-
-  /// Set the body of an identified struct. Returns failure if the body could
-  /// not be set, e.g. if the struct already has a body or if it was marked as
-  /// intentionally opaque. This might happen in a multi-threaded context when a
-  /// different thread modified the struct after it was created. Most callers
-  /// are likely to assert this always succeeds, but it is possible to implement
-  /// a local renaming scheme based on the result of this call.
-  LogicalResult setBody(ArrayRef<Type> types, bool isPacked);
-
-  /// Checks if a struct is packed.
-  bool isPacked() const;
-
-  /// Checks if a struct is identified.
-  bool isIdentified() const;
-
-  /// Checks if a struct is opaque.
-  bool isOpaque();
-
-  /// Checks if a struct is initialized.
-  bool isInitialized();
-
-  /// Returns the name of an identified struct.
-  StringRef getName();
-
-  /// Returns the list of element types contained in a non-opaque struct.
-  ArrayRef<Type> getBody() const;
-
-  /// Verifies that the type about to be constructed is well-formed.
-  static LogicalResult
-  verifyInvariants(function_ref<InFlightDiagnostic()> emitError, StringRef,
-                   bool);
-  static LogicalResult
-  verifyInvariants(function_ref<InFlightDiagnostic()> emitError,
-                   ArrayRef<Type> types, bool);
-  using Base::verifyInvariants;
-
-  /// Hooks for DataLayoutTypeInterface. Should not be called directly. Obtain a
-  /// DataLayout instance and query it instead.
-  llvm::TypeSize getTypeSizeInBits(const DataLayout &dataLayout,
-                                   DataLayoutEntryListRef params) const;
-
-  uint64_t getABIAlignment(const DataLayout &dataLayout,
-                           DataLayoutEntryListRef params) const;
-
-  uint64_t getPreferredAlignment(const DataLayout &dataLayout,
-                                 DataLayoutEntryListRef params) const;
-
-  bool areCompatible(DataLayoutEntryListRef oldLayout,
-                     DataLayoutEntryListRef newLayout) const;
-
-  LogicalResult verifyEntries(DataLayoutEntryListRef entries,
-                              Location loc) const;
-
-  /// Destructs the struct into its indexed field types.
-  std::optional<DenseMap<Attribute, Type>> getSubelementIndexMap();
-
-  /// Returns which type is stored at a given integer index within the struct.
-  Type getTypeAtIndex(Attribute index);
-};
-
 //===----------------------------------------------------------------------===//
 // Printing and parsing.
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td
index 09dd0919c318f..e88139fa5b28d 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td
@@ -117,6 +117,140 @@ def LLVMFunctionType : LLVMType<"LLVMFunction", "func"> {
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// LLVMStructType
+//===----------------------------------------------------------------------===//
+
+def LLVMStructType : LLVMType<"LLVMStruct", "struct", [
+    MutableType,
+    DeclareTypeInterfaceMethods<DataLayoutTypeInterface>,
+    DeclareTypeInterfaceMethods<DestructurableTypeInterface>
+  ]> {
+  let summary = "LLVM struct type";
+
+  let description = [{
+    LLVM dialect structure type representing a collection of different-typed
+    elements manipulated together. Struct types can optionally be packed, meaning
+    that their elements immediately follow each other in memory without
+    accounting for potential alignment.
+
+    Structure types can be identified (named) or literal. Literal structures
+    are uniquely represented by the list of types they contain and packedness.
+    Literal structure types are immutable after construction.
+
+    Identified structures are uniquely represented by their name, a string. They
+    have a mutable component, consisting of the list of types they contain,
+    the packedness and the opacity bits. Identified structs can be created
+    without providing the lists of element types, making them suitable to
+    represent recursive, i.e. self-referring, structures. Identified structs
+    without body are considered opaque. For such structs, one can set the body.
+    Identified structs can be created as intentionally-opaque, implying that the
+    caller does not intend to ever set the body (e.g. forward-declarations of
+    structs from another module) and wants to disallow further modification of
+    the body. For intentionally-opaque structs or non-opaque structs with the
+    body, one is not allowed to set another body (however, one can set exactly
+    the same body).
+
+    Note that the packedness of the struct takes place in uniquing of literal
+    structs, but does not in uniquing of identified structs.
+  }];
+
+  // Specify parameters for which TableGen can generate convenient getters for
+  // us.
+  // TODO: Other parameters such as 'packed' or 'opaque' could be added in the
+  //       future iff they generate getters prefixed with 'is', instead of
+  //       'get'. Until then there are no advantages in doing so.
+  let parameters = (ins
+    StringRefParameter<"struct name", [{""}]>:$name,
+    OptionalArrayRefParameter<"mlir::Type">:$body
+  );
+
+  // A custom storage class defined in C++ is required to implement mutability.
+  let storageClass = "LLVMStructTypeStorage";
+  let genStorageClass = 0;
+
+  // We want users to use the more aptly named custom builders below.
+  let skipDefaultBuilders = 1;
+
+  let extraClassDeclaration = [{
+    /// Checks if the given type can be contained in a structure type.
+    static bool isValidElementType(Type type);
+
+    /// Gets or creates an identified struct with the given name in the provided
+    /// context. Note that unlike llvm::StructType::create, this function will
+    /// _NOT_ rename a struct in case a struct with the same name already exists
+    /// in the context. Instead, it will just return the existing struct,
+    /// similarly to the rest of MLIR type ::get methods.
+    static LLVMStructType getIdentified(MLIRContext *context, StringRef name);
+    static LLVMStructType
+    getIdentifiedChecked(function_ref<InFlightDiagnostic()> emitError,
+                         MLIRContext *context, StringRef name);
+
+    /// Gets a new identified struct with the given body. The body _cannot_ be
+    /// changed later. If a struct with the given name already exists, renames
+    /// the struct by appending a `.` followed by a number to the name. Renaming
+    /// happens even if the existing struct has the same body.
+    static LLVMStructType getNewIdentified(MLIRContext *context, StringRef name,
+                                           ArrayRef<Type> elements,
+                                           bool isPacked = false);
+
+    /// Gets or creates a literal struct with the given body in the provided
+    /// context.
+    static LLVMStructType getLiteral(MLIRContext *context, ArrayRef<Type> types,
+                                     bool isPacked = false);
+
+    static LLVMStructType
+    getLiteralChecked(function_ref<InFlightDiagnostic()> emitError,
+                      MLIRContext *context, ArrayRef<Type> types,
+                      bool isPacked = false);
+
+    /// Gets or creates an intentionally-opaque identified struct. Such a struct
+    /// cannot have its body set.
+    /// Note that unlike llvm::StructType::create, this function will _NOT_
+    /// rename a struct in case a struct with the same name
+    /// already exists in the context. Instead, it will just return the existing
+    /// struct, similarly to the rest of MLIR type ::get methods.
+    static LLVMStructType getOpaque(StringRef name, MLIRContext *context);
+
+    static LLVMStructType
+    getOpaqueChecked(function_ref<InFlightDiagnostic()> emitError,
+                     MLIRContext *context, StringRef name);
+
+    /// Set the body of an identified struct. Returns failure if the body could
+    /// not be set, e.g. if the struct already has a body or if it was marked as
+    /// intentionally opaque. This might happen in a multi-threaded context when a
+    /// different thread modified the struct after it was created. Most callers
+    /// are likely to assert this always succeeds, but it is possible to implement
+    /// a local renaming scheme based on the result of this call.
+    LogicalResult setBody(ArrayRef<Type> types, bool isPacked);
+
+    /// Checks if a struct is packed.
+    bool isPacked() const;
+
+    /// Checks if a struct is identified.
+    bool isIdentified() const;
+
+    /// Checks if a struct is opaque.
+    bool isOpaque();
+
+    /// Checks if a struct is initialized.
+    bool isInitialized();
+
+    /// Verifies that the type about to be constructed is well-formed.
+    static LogicalResult
+    verifyInvariants(function_ref<InFlightDiagnostic()> emitError,
+                     StringRef, bool);
+    static LogicalResult
+    verifyInvariants(function_ref<InFlightDiagnostic()> emitError,
+                     ArrayRef<Type> types, bool);
+    using Base::verifyInvariants;
+  }];
+
+  let hasCustomAssemblyFormat = 1;
+}
+
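To make the builder semantics above concrete, here is a small standalone sketch (an editor's illustration, not part of the patch), assuming the generated C++ API matches the declarations in the def:

```cpp
// Sketch only; exercises the uniquing and set-once-body behavior described
// in the LLVMStructType documentation above.
#include <cassert>

#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/MLIRContext.h"

using namespace mlir;

int main() {
  MLIRContext ctx;
  ctx.loadDialect<LLVM::LLVMDialect>();
  Type i32 = IntegerType::get(&ctx, 32);
  Type f32 = Float32Type::get(&ctx);

  // Literal structs are uniqued by element list + packedness, so requesting
  // the same body twice yields the same immutable type.
  auto lit = LLVM::LLVMStructType::getLiteral(&ctx, {i32, f32});
  assert(lit == LLVM::LLVMStructType::getLiteral(&ctx, {i32, f32}));

  // Identified structs are uniqued by name and start without a body, which is
  // what makes self-referential types expressible.
  auto node = LLVM::LLVMStructType::getIdentified(&ctx, "node");
  assert(node.isIdentified() && !node.isInitialized());

  // The first setBody succeeds; installing a *different* body later fails
  // with a LogicalResult rather than renaming (unlike getNewIdentified).
  Type ptr = LLVM::LLVMPointerType::get(&ctx);
  LogicalResult r = node.setBody({i32, ptr}, /*isPacked=*/false);
  assert(succeeded(r));
  (void)r;
  return 0;
}
```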
 //===----------------------------------------------------------------------===//
 // LLVMPointerType
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/IR/AttrTypeBase.td b/mlir/include/mlir/IR/AttrTypeBase.td
index cbe4f0d67574b..38d38cf098df3 100644
--- a/mlir/include/mlir/IR/AttrTypeBase.td
+++ b/mlir/include/mlir/IR/AttrTypeBase.td
@@ -56,6 +56,9 @@ class ParamNativeTypeTrait
 class GenInternalTypeTrait<string prop> : GenInternalTrait<prop, "Type">;
 class PredTypeTrait<string descr, Pred pred> : PredTrait<descr, pred>;
 
+// Trait that must be attached to any type that is mutable.
+def MutableType : NativeTypeTrait<"IsMutable">;
+
 //===----------------------------------------------------------------------===//
 // Builders
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Bindings/Python/IRAttributes.cpp b/mlir/lib/Bindings/Python/IRAttributes.cpp
index 417c66b9165e3..cc9532f4e33b2 100644
--- a/mlir/lib/Bindings/Python/IRAttributes.cpp
+++ b/mlir/lib/Bindings/Python/IRAttributes.cpp
@@ -1102,11 +1102,11 @@ class PyDenseElementsAttribute
     unpackedBooleans = unpackedBooleans[py::slice(0, numBooleans, 1)];
     unpackedBooleans = equalFunc(unpackedBooleans, 1);
 
-    std::vector<intptr_t> shape;
     MlirType shapedType = mlirAttributeGetType(*this);
     intptr_t rank = mlirShapedTypeGetRank(shapedType);
+    std::vector<intptr_t> shape(rank);
    for (intptr_t i = 0; i < rank; ++i) {
-      shape.push_back(mlirShapedTypeGetDimSize(shapedType, i));
+      shape[i] = mlirShapedTypeGetDimSize(shapedType, i);
     }
 
     unpackedBooleans = reshapeFunc(unpackedBooleans, shape);
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
index 9bb0c80749a5f..d30a6b8398f06 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
@@ -3510,8 +3510,7 @@ void LLVMDialect::initialize() {
         LLVMPPCFP128Type,
         LLVMTokenType,
         LLVMLabelType,
-        LLVMMetadataType,
-        LLVMStructType>();
+        LLVMMetadataType>();
   // clang-format on
   registerTypes();
 
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp
index 903035a3ec229..655316cc5d66d 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp
@@ -1566,7 +1566,7 @@ DeletionKind LLVM::MemmoveOp::rewire(const DestructurableMemorySlot &slot,
 //===----------------------------------------------------------------------===//
 
 std::optional<DenseMap<Attribute, Type>>
-LLVM::LLVMStructType::getSubelementIndexMap() {
+LLVM::LLVMStructType::getSubelementIndexMap() const {
   Type i32 = IntegerType::get(getContext(), 32);
   DenseMap<Attribute, Type> destructured;
   for (const auto &[index, elemType] : llvm::enumerate(getBody()))
@@ -1574,7 +1574,7 @@ LLVM::LLVMStructType::getSubelementIndexMap() {
   return destructured;
 }
 
-Type LLVM::LLVMStructType::getTypeAtIndex(Attribute index) {
+Type LLVM::LLVMStructType::getTypeAtIndex(Attribute index) const {
   auto indexAttr = llvm::dyn_cast<IntegerAttr>(index);
   if (!indexAttr || !indexAttr.getType().isInteger(32))
     return {};
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp
index 1bed3fa48b30d..33c231e2d2045 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp
@@ -485,7 +485,7 @@ bool LLVMStructType::isOpaque() {
   return (getImpl()->isOpaque() || !getImpl()->isInitialized());
 }
 bool LLVMStructType::isInitialized() { return getImpl()->isInitialized(); }
-StringRef LLVMStructType::getName() { return getImpl()->getIdentifier(); }
+StringRef LLVMStructType::getName() const { return getImpl()->getIdentifier(); }
 ArrayRef<Type> LLVMStructType::getBody() const {
   return isIdentified() ? getImpl()->getIdentifiedStructBody()
                         : getImpl()->getTypeList();
diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp
index 5acd095da8e38..710c976281dc3 100644
--- a/mlir/lib/Transforms/Utils/DialectConversion.cpp
+++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp
@@ -434,23 +434,25 @@ class MoveBlockRewrite : public BlockRewrite {
 class BlockTypeConversionRewrite : public BlockRewrite {
 public:
   BlockTypeConversionRewrite(ConversionPatternRewriterImpl &rewriterImpl,
-                             Block *block, Block *origBlock)
-      : BlockRewrite(Kind::BlockTypeConversion, rewriterImpl, block),
-        origBlock(origBlock) {}
+                             Block *origBlock, Block *newBlock)
+      : BlockRewrite(Kind::BlockTypeConversion, rewriterImpl, origBlock),
+        newBlock(newBlock) {}
 
   static bool classof(const IRRewrite *rewrite) {
     return rewrite->getKind() == Kind::BlockTypeConversion;
   }
 
-  Block *getOrigBlock() const { return origBlock; }
+  Block *getOrigBlock() const { return block; }
+
+  Block *getNewBlock() const { return newBlock; }
 
   void commit(RewriterBase &rewriter) override;
 
   void rollback() override;
 
 private:
-  /// The original block that was requested to have its signature converted.
-  Block *origBlock;
+  /// The new block that was created as part of this signature conversion.
+  Block *newBlock;
 };
 
 /// Replacing a block argument. This rewrite is not immediately reflected in the
@@ -721,6 +723,18 @@ static bool hasRewrite(R &&rewrites, Operation *op) {
   });
 }
 
+#ifndef NDEBUG
+/// Return "true" if there is a block rewrite that matches the specified
+/// rewrite type and block among the given rewrites.
+template <typename RewriteTy, typename R>
+static bool hasRewrite(R &&rewrites, Block *block) {
+  return any_of(std::forward<R>(rewrites), [&](auto &rewrite) {
+    auto *rewriteTy = dyn_cast<RewriteTy>(rewrite.get());
+    return rewriteTy && rewriteTy->getBlock() == block;
+  });
+}
+#endif // NDEBUG
+
 //===----------------------------------------------------------------------===//
 // ConversionPatternRewriterImpl
 //===----------------------------------------------------------------------===//
@@ -966,12 +980,12 @@ void BlockTypeConversionRewrite::commit(RewriterBase &rewriter) {
   // block.
   if (auto *listener =
           dyn_cast_or_null<RewriterBase::Listener>(rewriter.getListener()))
-    for (Operation *op : block->getUsers())
+    for (Operation *op : getNewBlock()->getUsers())
       listener->notifyOperationModified(op);
 }
 
 void BlockTypeConversionRewrite::rollback() {
-  block->replaceAllUsesWith(origBlock);
+  getNewBlock()->replaceAllUsesWith(getOrigBlock());
 }
 
 void ReplaceBlockArgRewrite::commit(RewriterBase &rewriter) {
@@ -1223,6 +1237,9 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion(
     ConversionPatternRewriter &rewriter, Block *block,
     const TypeConverter *converter,
     TypeConverter::SignatureConversion &signatureConversion) {
+  // A block cannot be converted multiple times.
+  assert(!hasRewrite<BlockTypeConversionRewrite>(rewrites, block) &&
+         "block was already converted");
   OpBuilder::InsertionGuard g(rewriter);
 
   // If no arguments are being changed or added, there is nothing to do.
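For context on the DialectConversion changes above (an editor's illustration, not part of the patch): applySignatureConversion is normally reached through ConversionPatternRewriter. A minimal sketch, assuming a conversion that widens i16 block arguments to i32; the helper name is hypothetical:

```cpp
// Sketch only; "widenBlockArgs" is a made-up helper, not MLIR API.
#include "llvm/ADT/STLExtras.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/Transforms/DialectConversion.h"

using namespace mlir;

static Block *widenBlockArgs(ConversionPatternRewriter &rewriter,
                             Block *block) {
  TypeConverter::SignatureConversion conv(block->getNumArguments());
  for (auto [i, arg] : llvm::enumerate(block->getArguments())) {
    Type t = arg.getType();
    // Map each i16 argument to i32; keep all other argument types unchanged.
    Type newTy = t.isInteger(16) ? IntegerType::get(t.getContext(), 32) : t;
    conv.addInputs(static_cast<unsigned>(i), newTy);
  }
  // Returns the freshly created block (the "newBlock" tracked by
  // BlockTypeConversionRewrite above); the original block is unlinked and
  // erased when the rewrite is committed. Per the new assertion, the same
  // block must not be converted twice.
  return rewriter.applySignatureConversion(block, conv);
}
```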
@@ -1308,7 +1325,7 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion(
     appendRewrite<ReplaceBlockArgRewrite>(block, origArg, converter);
   }
 
-  appendRewrite<BlockTypeConversionRewrite>(newBlock, block);
+  appendRewrite<BlockTypeConversionRewrite>(/*origBlock=*/block, newBlock);
 
   // Erase the old block. (It is just unlinked for now and will be erased during
   // cleanup.)
diff --git a/mlir/test/lib/Dialect/Test/TestTypeDefs.td b/mlir/test/lib/Dialect/Test/TestTypeDefs.td
index 830475bed4e44..60108ac86d1ed 100644
--- a/mlir/test/lib/Dialect/Test/TestTypeDefs.td
+++ b/mlir/test/lib/Dialect/Test/TestTypeDefs.td
@@ -375,7 +375,7 @@ def TestI32 : Test_Type<"TestI32"> {
 }
 
 def TestRecursiveAlias
-    : Test_Type<"TestRecursiveAlias", [NativeTypeTrait<"IsMutable">]> {
+    : Test_Type<"TestRecursiveAlias", [MutableType]> {
   let mnemonic = "test_rec_alias";
   let storageClass = "TestRecursiveTypeStorage";
   let storageNamespace = "test";
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 74f13788ab29f..51d72d2e5f5b2 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -4886,7 +4886,9 @@ cc_library(
         ":FuncExtensions",
        ":FuncToLLVM",
         ":FuncTransformOps",
+        ":GPUToGPURuntimeTransforms",
         ":GPUToLLVMIRTranslation",
+        ":GPUToNVVMTransforms",
         ":GPUTransformOps",
         ":IndexToLLVM",
         ":LLVMToLLVMIRTranslation",
@@ -5907,6 +5909,7 @@ cc_library(
         ":ControlFlowDialect",
         ":ControlFlowToLLVM",
         ":ConversionPassIncGen",
+        ":ConvertToLLVMInterface",
         ":FuncDialect",
         ":FuncToLLVM",
         ":GPUCommonTransforms",
@@ -6088,6 +6091,7 @@ cc_library(
    hdrs = [
         "include/mlir/Conversion/GPUCommon/AttrToSPIRVConverter.h",
         "include/mlir/Conversion/GPUCommon/GPUCommonPass.h",
+        "include/mlir/Conversion/GPUCommon/GPUToLLVM.h",
         "lib/Conversion/GPUCommon/GPUOpsLowering.h",
     ],
     includes = ["include"],
@@ -8374,6 +8378,31 @@ cc_library(
     ],
 )
 
+gentbl_cc_library(
+    name = "ToLLVMInterfaceIncGen",
+    tbl_outs = [
+        (
+            ["--gen-attr-interface-decls"],
+            "include/mlir/Conversion/ConvertToLLVM/ToLLVMAttrInterface.h.inc",
+        ),
+        (
+            ["--gen-attr-interface-defs"],
+            "include/mlir/Conversion/ConvertToLLVM/ToLLVMAttrInterface.cpp.inc",
+        ),
+        (
+            ["--gen-op-interface-decls"],
+            "include/mlir/Conversion/ConvertToLLVM/ToLLVMOpInterface.h.inc",
+        ),
+        (
+            ["--gen-op-interface-defs"],
+            "include/mlir/Conversion/ConvertToLLVM/ToLLVMOpInterface.cpp.inc",
+        ),
+    ],
+    tblgen = ":mlir-tblgen",
+    td_file = "include/mlir/Conversion/ConvertToLLVM/ToLLVMInterface.td",
+    deps = [":UBDialectTdFiles"],
+)
+
 cc_library(
     name = "ConvertToLLVMInterface",
     srcs = ["lib/Conversion/ConvertToLLVM/ToLLVMInterface.cpp"],
@@ -8382,6 +8411,7 @@ cc_library(
     deps = [
         ":IR",
         ":Support",
+        ":ToLLVMInterfaceIncGen",
         "//llvm:Support",
     ],
 )
@@ -8394,6 +8424,7 @@ cc_library(
     deps = [
         ":ConversionPassIncGen",
         ":ConvertToLLVMInterface",
+        ":Analysis",
         ":IR",
         ":LLVMCommonConversion",
         ":LLVMDialect",
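As a closing illustration (an editor's note, not part of the patch), the destructurable-type queries that became const in the LLVMMemorySlot.cpp hunk above can be exercised as follows; note the i32 IntegerAttr indexing that getTypeAtIndex guards for:

```cpp
// Sketch only; assumes an MLIRContext with the LLVM dialect already loaded.
#include <cassert>
#include <optional>

#include "llvm/ADT/DenseMap.h"
#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinTypes.h"

using namespace mlir;

static void queryStructFields(MLIRContext &ctx) {
  Type i32 = IntegerType::get(&ctx, 32);
  Type f64 = Float64Type::get(&ctx);
  auto structTy = LLVM::LLVMStructType::getLiteral(&ctx, {i32, f64});

  // Field indices are i32 IntegerAttrs, matching the isInteger(32) check in
  // getTypeAtIndex; any other attribute kind yields a null Type.
  Attribute idx = IntegerAttr::get(i32, 1);
  assert(structTy.getTypeAtIndex(idx) == f64);

  // The full index -> field-type map, one entry per struct element.
  std::optional<DenseMap<Attribute, Type>> fields =
      structTy.getSubelementIndexMap();
  assert(fields && fields->size() == 2);
}
```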