diff --git a/clang/docs/ClangFormat.rst b/clang/docs/ClangFormat.rst index 7afad5b15b2d5..e17d741b0a00e 100644 --- a/clang/docs/ClangFormat.rst +++ b/clang/docs/ClangFormat.rst @@ -33,7 +33,7 @@ to format C/C++/Java/JavaScript/JSON/Objective-C/Protobuf/C# code. Clang-format options: --Werror - If set, changes formatting warnings to errors - --Wno-error= - If set don't error out on the specified warning type. + --Wno-error= - If set, don't error out on the specified warning type. =unknown - If set, unknown format options are only warned about. This can be used to enable formatting, even if the configuration contains unknown (newer) options. diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst index 3f996ceaff156..481362dba3f51 100644 --- a/clang/docs/OpenMPSupport.rst +++ b/clang/docs/OpenMPSupport.rst @@ -290,7 +290,7 @@ implementation. +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | memory management | changes to omp_alloctrait_key enum | :none:`unclaimed` | | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| memory model | seq_cst clause on flush construct | :none:`unclaimed` | | +| memory model | seq_cst clause on flush construct | :good:`done` | https://github.com/llvm/llvm-project/pull/114072 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | misc | 'omp_all_memory' keyword and use in 'depend' clause | :good:`done` | D125828, D126321 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 88a9f247c9a5b..5881684a07dad 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -498,6 +498,8 @@ Attribute Changes in Clang - Clang now supports ``[[clang::lifetime_capture_by(X)]]``. Similar to lifetimebound, this can be used to specify when a reference to a function parameter is captured by another capturing entity ``X``. +- The ``target_version`` attribute is now only supported for AArch64 and RISC-V architectures. + Improvements to Clang's diagnostics ----------------------------------- @@ -746,6 +748,7 @@ Bug Fixes to C++ Support assumption if they also occur inside of a dependent lambda. (#GH114787) - Clang now uses valid deduced type locations when diagnosing functions with trailing return type missing placeholder return type. (#GH78694) +- Fixed a bug where bounds of partially expanded pack indexing expressions were checked too early. 
(#GH116105) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h index 696a574833dad..1a24b8857674c 100644 --- a/clang/include/clang/AST/ExprCXX.h +++ b/clang/include/clang/AST/ExprCXX.h @@ -4390,17 +4390,17 @@ class PackIndexingExpr final unsigned TransformedExpressions : 31; LLVM_PREFERRED_TYPE(bool) - unsigned ExpandedToEmptyPack : 1; + unsigned FullySubstituted : 1; PackIndexingExpr(QualType Type, SourceLocation EllipsisLoc, SourceLocation RSquareLoc, Expr *PackIdExpr, Expr *IndexExpr, ArrayRef SubstitutedExprs = {}, - bool ExpandedToEmptyPack = false) + bool FullySubstituted = false) : Expr(PackIndexingExprClass, Type, VK_LValue, OK_Ordinary), EllipsisLoc(EllipsisLoc), RSquareLoc(RSquareLoc), SubExprs{PackIdExpr, IndexExpr}, TransformedExpressions(SubstitutedExprs.size()), - ExpandedToEmptyPack(ExpandedToEmptyPack) { + FullySubstituted(FullySubstituted) { auto *Exprs = getTrailingObjects(); std::uninitialized_copy(SubstitutedExprs.begin(), SubstitutedExprs.end(), @@ -4424,12 +4424,16 @@ class PackIndexingExpr final SourceLocation RSquareLoc, Expr *PackIdExpr, Expr *IndexExpr, std::optional Index, ArrayRef SubstitutedExprs = {}, - bool ExpandedToEmptyPack = false); + bool FullySubstituted = false); static PackIndexingExpr *CreateDeserialized(ASTContext &Context, unsigned NumTransformedExprs); + bool isFullySubstituted() const { return FullySubstituted; } + /// Determine if the expression was expanded to empty. - bool expandsToEmptyPack() const { return ExpandedToEmptyPack; } + bool expandsToEmptyPack() const { + return isFullySubstituted() && TransformedExpressions == 0; + } /// Determine the location of the 'sizeof' keyword. SourceLocation getEllipsisLoc() const { return EllipsisLoc; } diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h index 00c87e71bde31..d2f5267e4da5e 100644 --- a/clang/include/clang/AST/OpenMPClause.h +++ b/clang/include/clang/AST/OpenMPClause.h @@ -2670,8 +2670,8 @@ class OMPCompareClause final : public OMPClause { } }; -/// This represents 'seq_cst' clause in the '#pragma omp atomic' -/// directive. +/// This represents 'seq_cst' clause in the '#pragma omp atomic|flush' +/// directives. /// /// \code /// #pragma omp atomic seq_cst diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 1ed5c22361ca6..90a52b1dcbf62 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -5922,12 +5922,12 @@ class PackIndexingType final unsigned Size : 31; LLVM_PREFERRED_TYPE(bool) - unsigned ExpandsToEmptyPack : 1; + unsigned FullySubstituted : 1; protected: friend class ASTContext; // ASTContext creates these. 
PackIndexingType(const ASTContext &Context, QualType Canonical, - QualType Pattern, Expr *IndexExpr, bool ExpandsToEmptyPack, + QualType Pattern, Expr *IndexExpr, bool FullySubstituted, ArrayRef Expansions = {}); public: @@ -5951,7 +5951,9 @@ class PackIndexingType final bool hasSelectedType() const { return getSelectedIndex() != std::nullopt; } - bool expandsToEmptyPack() const { return ExpandsToEmptyPack; } + bool isFullySubstituted() const { return FullySubstituted; } + + bool expandsToEmptyPack() const { return isFullySubstituted() && Size == 0; } ArrayRef getExpansions() const { return {getExpansionsPtr(), Size}; @@ -5965,10 +5967,10 @@ class PackIndexingType final if (hasSelectedType()) getSelectedType().Profile(ID); else - Profile(ID, Context, getPattern(), getIndexExpr(), expandsToEmptyPack()); + Profile(ID, Context, getPattern(), getIndexExpr(), isFullySubstituted()); } static void Profile(llvm::FoldingSetNodeID &ID, const ASTContext &Context, - QualType Pattern, Expr *E, bool ExpandsToEmptyPack); + QualType Pattern, Expr *E, bool FullySubstituted); private: const QualType *getExpansionsPtr() const { diff --git a/clang/include/clang/AST/TypeProperties.td b/clang/include/clang/AST/TypeProperties.td index a8b9c920b617c..6f1a76bd18fb5 100644 --- a/clang/include/clang/AST/TypeProperties.td +++ b/clang/include/clang/AST/TypeProperties.td @@ -473,12 +473,12 @@ let Class = PackIndexingType in { def : Property<"indexExpression", ExprRef> { let Read = [{ node->getIndexExpr() }]; } - def : Property<"expandsToEmptyPack", Bool> { - let Read = [{ node->expandsToEmptyPack() }]; + def : Property<"isFullySubstituted", Bool> { + let Read = [{ node->isFullySubstituted() }]; } def : Creator<[{ - return ctx.getPackIndexingType(pattern, indexExpression, expandsToEmptyPack); + return ctx.getPackIndexingType(pattern, indexExpression, isFullySubstituted); }]>; } diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 634253d003256..14009826f2c55 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -3297,7 +3297,7 @@ def Target : InheritableAttr { }]; } -def TargetVersion : InheritableAttr { +def TargetVersion : InheritableAttr, TargetSpecificAttr> { let Spellings = [GCC<"target_version">]; let Args = [StringArgument<"NamesStr">]; let Subjects = SubjectList<[Function], ErrorDiag>; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 0b0759c6b545c..910bd6a344719 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -11399,7 +11399,7 @@ def err_omp_atomic_weak_no_equality : Error<"expected '==' operator for 'weak' c def err_omp_atomic_several_clauses : Error< "directive '#pragma omp atomic' cannot contain more than one 'read', 'write', 'update', 'capture', or 'compare' clause">; def err_omp_several_mem_order_clauses : Error< - "directive '#pragma omp %0' cannot contain more than one %select{'seq_cst', 'relaxed', |}1'acq_rel', 'acquire' or 'release' clause">; + "directive '#pragma omp %0' cannot contain more than one 'seq_cst',%select{ 'relaxed',|}1 'acq_rel', 'acquire' or 'release' clause">; def err_omp_atomic_incompatible_mem_order_clause : Error< "directive '#pragma omp atomic%select{ %0|}1' cannot be used with '%2' clause">; def note_omp_previous_mem_order_clause : Note< diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index ea6b414618c1d..056fad2cc0ff8 100644 --- 
a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -4970,12 +4970,11 @@ struct FormatStyle { /// \version 12 std::vector<std::string> StatementAttributeLikeMacros; - /// A vector of macros that should be interpreted as complete - /// statements. + /// A vector of macros that should be interpreted as complete statements. /// - /// Typical macros are expressions, and require a semi-colon to be - /// added; sometimes this is not the case, and this allows to make - /// clang-format aware of such cases. + /// Typical macros are expressions and require a semicolon to be added. + /// Sometimes this is not the case, and this allows making clang-format + /// aware of such cases. /// /// For example: Q_UNUSED /// \version 8 diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 5fe23e0d0efd3..24abd5d95dd84 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -14257,7 +14257,7 @@ class Sema final : public SemaBase { SourceLocation EllipsisLoc, Expr *IndexExpr, SourceLocation RSquareLoc, ArrayRef<Expr *> ExpandedExprs = {}, - bool EmptyPack = false); + bool FullySubstituted = false); /// Handle a C++1z fold-expression: ( expr op ... op expr ). ExprResult ActOnCXXFoldExpr(Scope *S, SourceLocation LParenLoc, Expr *LHS, diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 3e6f0d628ca92..80e8c5b9df58e 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -6223,13 +6223,11 @@ QualType ASTContext::getPackIndexingType(QualType Pattern, Expr *IndexExpr, ArrayRef<QualType> Expansions, int Index) const { QualType Canonical; - bool ExpandsToEmptyPack = FullySubstituted && Expansions.empty(); if (FullySubstituted && Index != -1) { Canonical = getCanonicalType(Expansions[Index]); } else { llvm::FoldingSetNodeID ID; - PackIndexingType::Profile(ID, *this, Pattern, IndexExpr, - ExpandsToEmptyPack); + PackIndexingType::Profile(ID, *this, Pattern, IndexExpr, FullySubstituted); void *InsertPos = nullptr; PackIndexingType *Canon = DependentPackIndexingTypes.FindNodeOrInsertPos(ID, InsertPos); @@ -6238,7 +6236,7 @@ QualType ASTContext::getPackIndexingType(QualType Pattern, Expr *IndexExpr, PackIndexingType::totalSizeToAlloc<QualType>(Expansions.size()), TypeAlignment); Canon = new (Mem) PackIndexingType(*this, QualType(), Pattern, IndexExpr, - ExpandsToEmptyPack, Expansions); + FullySubstituted, Expansions); DependentPackIndexingTypes.InsertNode(Canon, InsertPos); } Canonical = QualType(Canon, 0); @@ -6248,7 +6246,7 @@ QualType ASTContext::getPackIndexingType(QualType Pattern, Expr *IndexExpr, Allocate(PackIndexingType::totalSizeToAlloc<QualType>(Expansions.size()), TypeAlignment); auto *T = new (Mem) PackIndexingType(*this, Canonical, Pattern, IndexExpr, - ExpandsToEmptyPack, Expansions); + FullySubstituted, Expansions); Types.push_back(T); return QualType(T, 0); } diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 3d8215ffc8c22..6add18ef4e1af 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -1642,22 +1642,8 @@ bool Compiler<Emitter>::VisitImplicitValueInitExpr( if (QT->isIncompleteArrayType()) return true; - if (QT->isArrayType()) { - const ArrayType *AT = QT->getAsArrayTypeUnsafe(); - assert(AT); - const auto *CAT = cast<ConstantArrayType>(AT); - size_t NumElems = CAT->getZExtSize(); - PrimType ElemT = classifyPrim(CAT->getElementType()); - - for (size_t I = 0; I != NumElems; ++I) { - if (!this->visitZeroInitializer(ElemT, CAT->getElementType(), E)) - return false;
- if (!this->emitInitElem(ElemT, I, E)) - return false; - } - - return true; - } + if (QT->isArrayType()) + return this->visitZeroArrayInitializer(QT, E); if (const auto *ComplexTy = E->getType()->getAs()) { assert(Initializing); @@ -3916,18 +3902,9 @@ bool Compiler::visitZeroRecordInitializer(const Record *R, return false; } } else if (D->isCompositeArray()) { - const Record *ElemRecord = D->ElemDesc->ElemRecord; - assert(D->ElemDesc->ElemRecord); - for (uint32_t I = 0, N = D->getNumElems(); I != N; ++I) { - if (!this->emitConstUint32(I, E)) - return false; - if (!this->emitArrayElemPtr(PT_Uint32, E)) - return false; - if (!this->visitZeroRecordInitializer(ElemRecord, E)) - return false; - if (!this->emitPopPtr(E)) - return false; - } + // Can't be a vector or complex field. + if (!this->visitZeroArrayInitializer(D->getType(), E)) + return false; } else if (D->isRecord()) { if (!this->visitZeroRecordInitializer(D->ElemRecord, E)) return false; @@ -3958,6 +3935,52 @@ bool Compiler::visitZeroRecordInitializer(const Record *R, return true; } +template +bool Compiler::visitZeroArrayInitializer(QualType T, const Expr *E) { + assert(T->isArrayType() || T->isAnyComplexType() || T->isVectorType()); + const ArrayType *AT = T->getAsArrayTypeUnsafe(); + QualType ElemType = AT->getElementType(); + size_t NumElems = cast(AT)->getZExtSize(); + + if (std::optional ElemT = classify(ElemType)) { + for (size_t I = 0; I != NumElems; ++I) { + if (!this->visitZeroInitializer(*ElemT, ElemType, E)) + return false; + if (!this->emitInitElem(*ElemT, I, E)) + return false; + } + return true; + } else if (ElemType->isRecordType()) { + const Record *R = getRecord(ElemType); + + for (size_t I = 0; I != NumElems; ++I) { + if (!this->emitConstUint32(I, E)) + return false; + if (!this->emitArrayElemPtr(PT_Uint32, E)) + return false; + if (!this->visitZeroRecordInitializer(R, E)) + return false; + if (!this->emitPopPtr(E)) + return false; + } + return true; + } else if (ElemType->isArrayType()) { + for (size_t I = 0; I != NumElems; ++I) { + if (!this->emitConstUint32(I, E)) + return false; + if (!this->emitArrayElemPtr(PT_Uint32, E)) + return false; + if (!this->visitZeroArrayInitializer(ElemType, E)) + return false; + if (!this->emitPopPtr(E)) + return false; + } + return true; + } + + return false; +} + template template bool Compiler::emitConst(T Value, PrimType Ty, const Expr *E) { diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h index d1b624daba6b9..2a94f5ec76b6c 100644 --- a/clang/lib/AST/ByteCode/Compiler.h +++ b/clang/lib/AST/ByteCode/Compiler.h @@ -325,6 +325,7 @@ class Compiler : public ConstStmtVisitor, bool>, /// Emits a zero initializer. bool visitZeroInitializer(PrimType T, QualType QT, const Expr *E); bool visitZeroRecordInitializer(const Record *R, const Expr *E); + bool visitZeroArrayInitializer(QualType T, const Expr *E); /// Emits an APSInt constant. bool emitConst(const llvm::APSInt &Value, PrimType Ty, const Expr *E); diff --git a/clang/lib/AST/ByteCode/InterpFrame.cpp b/clang/lib/AST/ByteCode/InterpFrame.cpp index 7f02464a1c0f1..20f67d9b1fd42 100644 --- a/clang/lib/AST/ByteCode/InterpFrame.cpp +++ b/clang/lib/AST/ByteCode/InterpFrame.cpp @@ -234,7 +234,12 @@ SourceInfo InterpFrame::getSource(CodePtr PC) const { if (Func && !funcHasUsableBody(Func) && Caller) return Caller->getSource(RetPC); - return S.getSource(Func, PC); + // Similarly, if the resulting source location is invalid anyway, + // point to the caller instead. 
+ SourceInfo Result = S.getSource(Func, PC); + if (Result.getLoc().isInvalid() && Caller) + return Caller->getSource(RetPC); + return Result; } const Expr *InterpFrame::getExpr(CodePtr PC) const { diff --git a/clang/lib/AST/ComputeDependence.cpp b/clang/lib/AST/ComputeDependence.cpp index e37ebec085195..07c4419e3cf40 100644 --- a/clang/lib/AST/ComputeDependence.cpp +++ b/clang/lib/AST/ComputeDependence.cpp @@ -388,9 +388,8 @@ ExprDependence clang::computeDependence(PackIndexingExpr *E) { ExprDependence::Instantiation; ArrayRef Exprs = E->getExpressions(); - if (Exprs.empty()) + if (Exprs.empty() || !E->isFullySubstituted()) D |= PatternDep | ExprDependence::Instantiation; - else if (!E->getIndexExpr()->isInstantiationDependent()) { std::optional Index = E->getSelectedIndex(); assert(Index && *Index < Exprs.size() && "pack index out of bound"); diff --git a/clang/lib/AST/ExprCXX.cpp b/clang/lib/AST/ExprCXX.cpp index 678c9245ab46e..fc09d24fc30cb 100644 --- a/clang/lib/AST/ExprCXX.cpp +++ b/clang/lib/AST/ExprCXX.cpp @@ -1717,9 +1717,9 @@ NonTypeTemplateParmDecl *SubstNonTypeTemplateParmExpr::getParameter() const { PackIndexingExpr *PackIndexingExpr::Create( ASTContext &Context, SourceLocation EllipsisLoc, SourceLocation RSquareLoc, Expr *PackIdExpr, Expr *IndexExpr, std::optional Index, - ArrayRef SubstitutedExprs, bool ExpandedToEmptyPack) { + ArrayRef SubstitutedExprs, bool FullySubstituted) { QualType Type; - if (Index && !SubstitutedExprs.empty()) + if (Index && FullySubstituted && !SubstitutedExprs.empty()) Type = SubstitutedExprs[*Index]->getType(); else Type = Context.DependentTy; @@ -1728,7 +1728,7 @@ PackIndexingExpr *PackIndexingExpr::Create( Context.Allocate(totalSizeToAlloc(SubstitutedExprs.size())); return new (Storage) PackIndexingExpr(Type, EllipsisLoc, RSquareLoc, PackIdExpr, IndexExpr, - SubstitutedExprs, ExpandedToEmptyPack); + SubstitutedExprs, FullySubstituted); } NamedDecl *PackIndexingExpr::getPackDecl() const { diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index b70f86ef31442..edf20944f0b3e 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -4031,12 +4031,12 @@ void DependentDecltypeType::Profile(llvm::FoldingSetNodeID &ID, PackIndexingType::PackIndexingType(const ASTContext &Context, QualType Canonical, QualType Pattern, - Expr *IndexExpr, bool ExpandsToEmptyPack, + Expr *IndexExpr, bool FullySubstituted, ArrayRef Expansions) : Type(PackIndexing, Canonical, computeDependence(Pattern, IndexExpr, Expansions)), Context(Context), Pattern(Pattern), IndexExpr(IndexExpr), - Size(Expansions.size()), ExpandsToEmptyPack(ExpandsToEmptyPack) { + Size(Expansions.size()), FullySubstituted(FullySubstituted) { std::uninitialized_copy(Expansions.begin(), Expansions.end(), getTrailingObjects()); @@ -4081,10 +4081,10 @@ PackIndexingType::computeDependence(QualType Pattern, Expr *IndexExpr, void PackIndexingType::Profile(llvm::FoldingSetNodeID &ID, const ASTContext &Context, QualType Pattern, - Expr *E, bool ExpandsToEmptyPack) { + Expr *E, bool FullySubstituted) { Pattern.Profile(ID); E->Profile(ID, Context, true); - ID.AddBoolean(ExpandsToEmptyPack); + ID.AddBoolean(FullySubstituted); } UnaryTransformType::UnaryTransformType(QualType BaseType, diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index fd53969e4b3b3..aed86c1fb9955 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -693,17 +693,14 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState 
&State, bool DryRun, bool DisallowLineBreaksOnThisLine = Style.LambdaBodyIndentation == FormatStyle::LBI_Signature && - Style.isCpp() && [&Current] { - // Deal with lambda arguments in C++. The aim here is to ensure that we - // don't over-indent lambda function bodies when lambdas are passed as - // arguments to function calls. We do this by ensuring that either all - // arguments (including any lambdas) go on the same line as the function - // call, or we break before the first argument. - const auto *Prev = Current.Previous; - if (!Prev) - return false; + // Deal with lambda arguments in C++. The aim here is to ensure that we + // don't over-indent lambda function bodies when lambdas are passed as + // arguments to function calls. We do this by ensuring that either all + // arguments (including any lambdas) go on the same line as the function + // call, or we break before the first argument. + Style.isCpp() && [&] { // For example, `/*Newline=*/false`. - if (Prev->is(TT_BlockComment) && Current.SpacesRequiredBefore == 0) + if (Previous.is(TT_BlockComment) && Current.SpacesRequiredBefore == 0) return false; const auto *PrevNonComment = Current.getPreviousNonComment(); if (!PrevNonComment || PrevNonComment->isNot(tok::l_paren)) diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index ddb8ef1314aeb..ada4514f314dd 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -11535,7 +11535,7 @@ StmtResult SemaOpenMP::ActOnOpenMPFlushDirective(ArrayRef Clauses, if (C->getClauseKind() == OMPC_acq_rel || C->getClauseKind() == OMPC_acquire || C->getClauseKind() == OMPC_release || - C->getClauseKind() == OMPC_seq_cst) { + C->getClauseKind() == OMPC_seq_cst /*OpenMP 5.1*/) { if (MemOrderKind != OMPC_unknown) { Diag(C->getBeginLoc(), diag::err_omp_several_mem_order_clauses) << getOpenMPDirectiveName(OMPD_flush) << 1 diff --git a/clang/lib/Sema/SemaTemplateVariadic.cpp b/clang/lib/Sema/SemaTemplateVariadic.cpp index 2ea2a368dd24c..86d15f6324f4f 100644 --- a/clang/lib/Sema/SemaTemplateVariadic.cpp +++ b/clang/lib/Sema/SemaTemplateVariadic.cpp @@ -1157,10 +1157,12 @@ ExprResult Sema::ActOnPackIndexingExpr(Scope *S, Expr *PackExpression, return Res; } -ExprResult -Sema::BuildPackIndexingExpr(Expr *PackExpression, SourceLocation EllipsisLoc, - Expr *IndexExpr, SourceLocation RSquareLoc, - ArrayRef ExpandedExprs, bool EmptyPack) { +ExprResult Sema::BuildPackIndexingExpr(Expr *PackExpression, + SourceLocation EllipsisLoc, + Expr *IndexExpr, + SourceLocation RSquareLoc, + ArrayRef ExpandedExprs, + bool FullySubstituted) { std::optional Index; if (!IndexExpr->isInstantiationDependent()) { @@ -1174,8 +1176,8 @@ Sema::BuildPackIndexingExpr(Expr *PackExpression, SourceLocation EllipsisLoc, IndexExpr = Res.get(); } - if (Index && (!ExpandedExprs.empty() || EmptyPack)) { - if (*Index < 0 || EmptyPack || *Index >= int64_t(ExpandedExprs.size())) { + if (Index && FullySubstituted) { + if (*Index < 0 || *Index >= int64_t(ExpandedExprs.size())) { Diag(PackExpression->getBeginLoc(), diag::err_pack_index_out_of_bound) << *Index << PackExpression << ExpandedExprs.size(); return ExprError(); @@ -1184,7 +1186,7 @@ Sema::BuildPackIndexingExpr(Expr *PackExpression, SourceLocation EllipsisLoc, return PackIndexingExpr::Create(getASTContext(), EllipsisLoc, RSquareLoc, PackExpression, IndexExpr, Index, - ExpandedExprs, EmptyPack); + ExpandedExprs, FullySubstituted); } TemplateArgumentLoc Sema::getTemplateArgumentPackExpansionPattern( diff --git a/clang/lib/Sema/TreeTransform.h 
b/clang/lib/Sema/TreeTransform.h index 1465bba87724b..9cf1b2d073a90 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -3670,10 +3670,10 @@ class TreeTransform { SourceLocation RSquareLoc, Expr *PackIdExpression, Expr *IndexExpr, ArrayRef ExpandedExprs, - bool EmptyPack = false) { + bool FullySubstituted = false) { return getSema().BuildPackIndexingExpr(PackIdExpression, EllipsisLoc, IndexExpr, RSquareLoc, ExpandedExprs, - EmptyPack); + FullySubstituted); } /// Build a new expression representing a call to a source location @@ -6769,6 +6769,7 @@ TreeTransform::TransformPackIndexingType(TypeLocBuilder &TLB, if (Out.isNull()) return QualType(); SubtitutedTypes.push_back(Out); + FullySubstituted &= !Out->containsUnexpandedParameterPack(); } // If we're supposed to retain a pack expansion, do so by temporarily // forgetting the partially-substituted parameter pack. @@ -15581,6 +15582,7 @@ TreeTransform::TransformPackIndexingExpr(PackIndexingExpr *E) { } SmallVector ExpandedExprs; + bool FullySubstituted = true; if (!E->expandsToEmptyPack() && E->getExpressions().empty()) { Expr *Pattern = E->getPackIdExpression(); SmallVector Unexpanded; @@ -15605,7 +15607,7 @@ TreeTransform::TransformPackIndexingExpr(PackIndexingExpr *E) { return ExprError(); return getDerived().RebuildPackIndexingExpr( E->getEllipsisLoc(), E->getRSquareLoc(), Pack.get(), IndexExpr.get(), - {}); + {}, /*FullySubstituted=*/false); } for (unsigned I = 0; I != *NumExpansions; ++I) { Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(getSema(), I); @@ -15617,6 +15619,7 @@ TreeTransform::TransformPackIndexingExpr(PackIndexingExpr *E) { OrigNumExpansions); if (Out.isInvalid()) return true; + FullySubstituted = false; } ExpandedExprs.push_back(Out.get()); } @@ -15633,6 +15636,7 @@ TreeTransform::TransformPackIndexingExpr(PackIndexingExpr *E) { OrigNumExpansions); if (Out.isInvalid()) return true; + FullySubstituted = false; ExpandedExprs.push_back(Out.get()); } } else if (!E->expandsToEmptyPack()) { @@ -15644,8 +15648,7 @@ TreeTransform::TransformPackIndexingExpr(PackIndexingExpr *E) { return getDerived().RebuildPackIndexingExpr( E->getEllipsisLoc(), E->getRSquareLoc(), E->getPackIdExpression(), - IndexExpr.get(), ExpandedExprs, - /*EmptyPack=*/ExpandedExprs.size() == 0); + IndexExpr.get(), ExpandedExprs, FullySubstituted); } template diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp index c39a1950a6cf2..731ad0b64dc85 100644 --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -2191,7 +2191,7 @@ void ASTStmtReader::VisitSizeOfPackExpr(SizeOfPackExpr *E) { void ASTStmtReader::VisitPackIndexingExpr(PackIndexingExpr *E) { VisitExpr(E); E->TransformedExpressions = Record.readInt(); - E->ExpandedToEmptyPack = Record.readInt(); + E->FullySubstituted = Record.readInt(); E->EllipsisLoc = readSourceLocation(); E->RSquareLoc = readSourceLocation(); E->SubExprs[0] = Record.readStmt(); diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index e7f567ff59a8a..4994047d9fe10 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -2191,7 +2191,7 @@ void ASTStmtWriter::VisitSizeOfPackExpr(SizeOfPackExpr *E) { void ASTStmtWriter::VisitPackIndexingExpr(PackIndexingExpr *E) { VisitExpr(E); Record.push_back(E->TransformedExpressions); - Record.push_back(E->ExpandedToEmptyPack); + Record.push_back(E->FullySubstituted); 
Record.AddSourceLocation(E->getEllipsisLoc()); Record.AddSourceLocation(E->getRSquareLoc()); Record.AddStmt(E->getPackIdExpression()); diff --git a/clang/test/AST/ByteCode/placement-new.cpp b/clang/test/AST/ByteCode/placement-new.cpp index 56f54ff168f3e..7a4fc89a27dac 100644 --- a/clang/test/AST/ByteCode/placement-new.cpp +++ b/clang/test/AST/ByteCode/placement-new.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -std=c++2c -fcxx-exceptions -fexperimental-new-constant-interpreter -verify=expected,both %s +// RUN: %clang_cc1 -std=c++2c -fcxx-exceptions -fexperimental-new-constant-interpreter -verify=expected,both %s -DBYTECODE // RUN: %clang_cc1 -std=c++2c -fcxx-exceptions -verify=ref,both %s namespace std { @@ -338,3 +338,17 @@ namespace PR48606 { } static_assert(f()); } + +#ifdef BYTECODE +constexpr int N = [] // expected-error {{must be initialized by a constant expression}} \ + // expected-note {{assignment to dereferenced one-past-the-end pointer is not allowed in a constant expression}} \ + // expected-note {{in call to}} +{ + struct S { + int a[1]; + }; + S s; + ::new (s.a) int[1][2][3][4](); + return s.a[0]; +}(); +#endif diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index 3bc6107b7fd40..c22a43146a8c8 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -1,6 +1,6 @@ // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu tahiti -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefixes=CHECK-AMDGCN,CHECK %s -// RUN: %clang_cc1 -cl-std=CL2.0 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefix=CHECK %s +// RUN: %clang_cc1 -cl-std=CL2.0 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefixes=CHECK,CHECK-SPIRV %s #pragma OPENCL EXTENSION cl_khr_fp64 : enable @@ -866,7 +866,8 @@ void test_atomic_inc_dec(__attribute__((address_space(3))) uint *lptr, __attribu // CHECK-LABEL test_wavefrontsize( unsigned test_wavefrontsize() { - // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wavefrontsize() + // CHECK-AMDGCN: ret i32 {{[0-9]+}} + // CHECK-SPIRV: {{.*}}call{{.*}} i32 @llvm.amdgcn.wavefrontsize() return __builtin_amdgcn_wavefrontsize(); } diff --git a/clang/test/OpenMP/flush_ast_print.cpp b/clang/test/OpenMP/flush_ast_print.cpp index 9578ada020227..768282422032f 100644 --- a/clang/test/OpenMP/flush_ast_print.cpp +++ b/clang/test/OpenMP/flush_ast_print.cpp @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=51 -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -std=c++11 
-include-pch %t -verify %s -ast-print | FileCheck %s // expected-no-diagnostics #ifndef HEADER @@ -19,6 +19,7 @@ T tmain(T argc) { #pragma omp flush acq_rel #pragma omp flush acquire #pragma omp flush release +#pragma omp flush seq_cst #pragma omp flush(a) return a + argc; } @@ -27,18 +28,21 @@ T tmain(T argc) { // CHECK-NEXT: #pragma omp flush acq_rel{{$}} // CHECK-NEXT: #pragma omp flush acquire{{$}} // CHECK-NEXT: #pragma omp flush release{{$}} +// CHECK-NEXT: #pragma omp flush seq_cst{{$}} // CHECK-NEXT: #pragma omp flush (a) // CHECK: static int a; // CHECK-NEXT: #pragma omp flush // CHECK-NEXT: #pragma omp flush acq_rel{{$}} // CHECK-NEXT: #pragma omp flush acquire{{$}} // CHECK-NEXT: #pragma omp flush release{{$}} +// CHECK-NEXT: #pragma omp flush seq_cst{{$}} // CHECK-NEXT: #pragma omp flush (a) // CHECK: static char a; // CHECK-NEXT: #pragma omp flush // CHECK-NEXT: #pragma omp flush acq_rel{{$}} // CHECK-NEXT: #pragma omp flush acquire{{$}} // CHECK-NEXT: #pragma omp flush release{{$}} +// CHECK-NEXT: #pragma omp flush seq_cst{{$}} // CHECK-NEXT: #pragma omp flush (a) int main(int argc, char **argv) { @@ -48,11 +52,13 @@ int main(int argc, char **argv) { #pragma omp flush acq_rel #pragma omp flush acquire #pragma omp flush release +#pragma omp flush seq_cst #pragma omp flush(a) // CHECK-NEXT: #pragma omp flush // CHECK-NEXT: #pragma omp flush acq_rel // CHECK-NEXT: #pragma omp flush acquire{{$}} // CHECK-NEXT: #pragma omp flush release +// CHECK-NEXT: #pragma omp flush seq_cst // CHECK-NEXT: #pragma omp flush (a) return tmain(argc) + tmain(argv[0][0]) + a; } diff --git a/clang/test/OpenMP/flush_codegen.cpp b/clang/test/OpenMP/flush_codegen.cpp index c7dd88ef9ac31..fa2586d9fe258 100644 --- a/clang/test/OpenMP/flush_codegen.cpp +++ b/clang/test/OpenMP/flush_codegen.cpp @@ -1,13 +1,13 @@ -// RUN: %clang_cc1 -verify -fopenmp -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s -// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-enable-irbuilder -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-enable-irbuilder -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-enable-irbuilder -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -fopenmp-enable-irbuilder -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -fopenmp-enable-irbuilder -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions 
-emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -fopenmp-enable-irbuilder -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=51 -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} // expected-no-diagnostics #ifndef HEADER @@ -17,6 +17,7 @@ template T tmain(T argc) { static T a; #pragma omp flush +#pragma omp flush seq_cst #pragma omp flush acq_rel #pragma omp flush acquire #pragma omp flush release @@ -28,6 +29,7 @@ T tmain(T argc) { int main() { static int a; #pragma omp flush +#pragma omp flush seq_cst #pragma omp flush acq_rel #pragma omp flush acquire #pragma omp flush release @@ -37,6 +39,7 @@ int main() { // CHECK: call {{.*}}void @__kmpc_flush(ptr {{(@|%).+}}) // CHECK: call {{.*}}void @__kmpc_flush(ptr {{(@|%).+}}) // CHECK: call {{.*}}void @__kmpc_flush(ptr {{(@|%).+}}) + // CHECK: call {{.*}}void @__kmpc_flush(ptr {{(@|%).+}}) return tmain(a); // CHECK: call {{.*}} [[TMAIN:@.+]]( // CHECK: ret @@ -48,6 +51,7 @@ int main() { // CHECK: call {{.*}}void @__kmpc_flush(ptr {{(@|%).+}}) // CHECK: call {{.*}}void @__kmpc_flush(ptr {{(@|%).+}}) // CHECK: call {{.*}}void @__kmpc_flush(ptr {{(@|%).+}}) +// CHECK: call {{.*}}void @__kmpc_flush(ptr {{(@|%).+}}) // CHECK: ret // CHECK-NOT: line: 0, diff --git a/clang/test/OpenMP/flush_messages.cpp b/clang/test/OpenMP/flush_messages.cpp index 3e9142625b274..e78949bc924e1 100644 --- a/clang/test/OpenMP/flush_messages.cpp +++ b/clang/test/OpenMP/flush_messages.cpp @@ -134,14 +134,12 @@ label1 : { #pragma omp flush(argc) flush(argc) // expected-warning {{extra tokens at the end of '#pragma omp flush' are ignored}} #pragma omp parallel flush(argc) // expected-warning {{extra tokens at the end of '#pragma omp parallel' are ignored}} ; -#pragma omp flush seq_cst // omp45-error {{unexpected OpenMP clause 'seq_cst' in directive '#pragma omp flush'}} #pragma omp flush acq_rel // omp45-error {{unexpected OpenMP clause 'acq_rel' in directive '#pragma omp flush'}} #pragma omp flush acquire // omp45-error {{unexpected OpenMP clause 'acquire' in directive '#pragma omp flush'}} #pragma omp flush release // omp45-error {{unexpected OpenMP clause 'release' in directive '#pragma omp flush'}} #pragma omp flush relaxed // expected-error {{unexpected OpenMP clause 'relaxed' in directive '#pragma omp flush'}} -#pragma omp flush seq_cst // omp45-error {{unexpected OpenMP clause 'seq_cst' in directive '#pragma omp flush'}} -#pragma 
omp flush acq_rel acquire // omp45-error {{unexpected OpenMP clause 'acq_rel' in directive '#pragma omp flush'}} omp45-error {{unexpected OpenMP clause 'acquire' in directive '#pragma omp flush'}} omp51-error {{directive '#pragma omp flush' cannot contain more than one 'acq_rel', 'acquire' or 'release' clause}} omp51-note {{'acq_rel' clause used here}} -#pragma omp flush release acquire // omp45-error {{unexpected OpenMP clause 'release' in directive '#pragma omp flush'}} omp45-error {{unexpected OpenMP clause 'acquire' in directive '#pragma omp flush'}} omp51-error {{directive '#pragma omp flush' cannot contain more than one 'acq_rel', 'acquire' or 'release' clause}} omp51-note {{'release' clause used here}} +#pragma omp flush acq_rel acquire // omp45-error {{unexpected OpenMP clause 'acq_rel' in directive '#pragma omp flush'}} omp45-error {{unexpected OpenMP clause 'acquire' in directive '#pragma omp flush'}} omp51-error {{directive '#pragma omp flush' cannot contain more than one 'seq_cst', 'acq_rel', 'acquire' or 'release' clause}} omp51-note {{'acq_rel' clause used here}} +#pragma omp flush release acquire // omp45-error {{unexpected OpenMP clause 'release' in directive '#pragma omp flush'}} omp45-error {{unexpected OpenMP clause 'acquire' in directive '#pragma omp flush'}} omp51-error {{directive '#pragma omp flush' cannot contain more than one 'seq_cst', 'acq_rel', 'acquire' or 'release' clause}} omp51-note {{'release' clause used here}} #pragma omp flush acq_rel (argc) // omp45-error {{unexpected OpenMP clause 'acq_rel' in directive '#pragma omp flush'}} expected-warning {{extra tokens at the end of '#pragma omp flush' are ignored}} #pragma omp flush(argc) acq_rel // omp45-error {{unexpected OpenMP clause 'acq_rel' in directive '#pragma omp flush'}} omp51-error {{'flush' directive with memory order clause 'acq_rel' cannot have the list}} omp51-note {{memory order clause 'acq_rel' is specified here}} return tmain(argc); diff --git a/clang/test/Sema/attr-target-version-unsupported.c b/clang/test/Sema/attr-target-version-unsupported.c new file mode 100644 index 0000000000000..7cf8172f5272e --- /dev/null +++ b/clang/test/Sema/attr-target-version-unsupported.c @@ -0,0 +1,4 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify %s + +//expected-warning@+1 {{unknown attribute 'target_version' ignored}} +int __attribute__((target_version("aes"))) foo(void) { return 3; } diff --git a/clang/test/SemaCXX/cxx2c-pack-indexing.cpp b/clang/test/SemaCXX/cxx2c-pack-indexing.cpp index 962dbb8137f28..cb679a6c3ad87 100644 --- a/clang/test/SemaCXX/cxx2c-pack-indexing.cpp +++ b/clang/test/SemaCXX/cxx2c-pack-indexing.cpp @@ -271,3 +271,37 @@ void f() { } } // namespace GH105903 + +namespace GH116105 { + +template using pack_type = Ts...[Np]; + +template using pack_expr = decltype(Ts...[Np]); + +template struct types; + +template struct indices; + +template struct repack; + +template struct repack> { + template + using pack_type_alias = types...>; + + template + using pack_expr_alias = types...>; +}; + +template struct mdispatch_ { + using Idx = __make_integer_seq; + + static_assert(__is_same( + typename repack::template pack_type_alias, types)); + + static_assert(__is_same( + typename repack::template pack_expr_alias, types)); +}; + +mdispatch_ d; + +} // namespace GH116105 diff --git a/clang/tools/clang-format/ClangFormat.cpp b/clang/tools/clang-format/ClangFormat.cpp index cc735e4872592..5481bb6b87503 100644 --- a/clang/tools/clang-format/ClangFormat.cpp +++ 
b/clang/tools/clang-format/ClangFormat.cpp @@ -178,7 +178,7 @@ enum class WNoError { Unknown }; static cl::bits WNoErrorList( "Wno-error", - cl::desc("If set don't error out on the specified warning type."), + cl::desc("If set, don't error out on the specified warning type."), cl::values( clEnumValN(WNoError::Unknown, "unknown", "If set, unknown format options are only warned about.\n" diff --git a/compiler-rt/lib/interception/interception_win.cpp b/compiler-rt/lib/interception/interception_win.cpp index ac81beee11a39..8b8ce1abe906f 100644 --- a/compiler-rt/lib/interception/interception_win.cpp +++ b/compiler-rt/lib/interception/interception_win.cpp @@ -816,6 +816,10 @@ static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) { // mov rax, QWORD PTR [rip + XXXXXXXX] case 0x058d48: // 48 8d 05 XX XX XX XX : // lea rax, QWORD PTR [rip + XXXXXXXX] + case 0x0d8948: // 48 89 0d XX XX XX XX : + // mov QWORD PTR [rip + XXXXXXXX], rcx + case 0x158948: // 48 89 15 XX XX XX XX : + // mov QWORD PTR [rip + XXXXXXXX], rdx case 0x25ff48: // 48 ff 25 XX XX XX XX : // rex.W jmp QWORD PTR [rip + XXXXXXXX] case 0x158D4C: // 4c 8d 15 XX XX XX XX : lea r10, [rip + XX] diff --git a/flang/test/Lower/OpenMP/Todo/flush-seq-cst.f90 b/flang/test/Lower/OpenMP/Todo/flush-seq-cst.f90 new file mode 100644 index 0000000000000..753e1cfcd7aa5 --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/flush-seq-cst.f90 @@ -0,0 +1,6 @@ +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s -fopenmp-version=51 2>&1 | FileCheck %s + +! CHECK: not yet implemented: Unhandled clause SEQ_CST in FLUSH construct +program flush_seq_cst + !$omp flush seq_cst +end program \ No newline at end of file diff --git a/flang/test/Semantics/OpenMP/clause-validity01.f90 b/flang/test/Semantics/OpenMP/clause-validity01.f90 index 406d30b38948e..8dd6d10200cd3 100644 --- a/flang/test/Semantics/OpenMP/clause-validity01.f90 +++ b/flang/test/Semantics/OpenMP/clause-validity01.f90 @@ -1,6 +1,6 @@ ! REQUIRES: openmp_runtime -! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags %openmp_module_flag -fopenmp-version=50 +! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags %openmp_module_flag -fopenmp-version=51 use omp_lib ! Check OpenMP clause validity for the following directives: ! @@ -507,7 +507,6 @@ !$omp flush acquire !ERROR: If memory-order-clause is RELEASE, ACQUIRE, or ACQ_REL, list items must not be specified on the FLUSH directive !$omp flush release (c) - !ERROR: SEQ_CST clause is not allowed on the FLUSH directive !$omp flush seq_cst !ERROR: RELAXED clause is not allowed on the FLUSH directive !$omp flush relaxed diff --git a/flang/test/Semantics/OpenMP/flush02.f90 b/flang/test/Semantics/OpenMP/flush02.f90 index ed0cf6602d574..615332c6cf31c 100644 --- a/flang/test/Semantics/OpenMP/flush02.f90 +++ b/flang/test/Semantics/OpenMP/flush02.f90 @@ -1,6 +1,6 @@ ! REQUIRES: openmp_runtime -! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags -fopenmp-version=50 +! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags -fopenmp-version=51 ! Check OpenMP 5.0 - 2.17.8 flush Construct ! Restriction - @@ -27,7 +27,6 @@ !Only memory-order-clauses. if (omp_get_thread_num() == 1) THEN ! Not allowed clauses. - !ERROR: SEQ_CST clause is not allowed on the FLUSH directive !$omp flush seq_cst !ERROR: RELAXED clause is not allowed on the FLUSH directive !$omp flush relaxed @@ -41,7 +40,6 @@ !$omp flush acquire acquire ! Mix of allowed and not allowed. 
- !ERROR: SEQ_CST clause is not allowed on the FLUSH directive !$omp flush seq_cst acquire END IF diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index 1f69e4c9acbbd..75121285b7b23 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -1094,7 +1094,7 @@ void InputSection::relocateNonAlloc(Ctx &ctx, uint8_t *buf, // R_ABS/R_DTPREL and some other relocations can be used from non-SHF_ALLOC // sections. if (LLVM_LIKELY(expr == R_ABS) || expr == R_DTPREL || expr == R_GOTPLTREL || - expr == R_RISCV_ADD) { + expr == R_RISCV_ADD || expr == R_ARM_SBREL) { target.relocateNoSym(bufLoc, type, SignExtend64(sym.getVA(ctx, addend))); continue; diff --git a/lld/test/ELF/arm-rwpi-debug-relocs.s b/lld/test/ELF/arm-rwpi-debug-relocs.s new file mode 100644 index 0000000000000..2bb968d4afa9a --- /dev/null +++ b/lld/test/ELF/arm-rwpi-debug-relocs.s @@ -0,0 +1,54 @@ +/// Test that R_ARM_SBREL32 relocations in debug info are relocated as if the +/// static base register (r9) is zero. Real DWARF info will use an expression to +/// add this to the real value of the static base at runtime. + +// REQUIRES: arm +// RUN: rm -rf %t && split-file %s %t && cd %t + +// RUN: llvm-mc -filetype=obj -triple=armv7a asm.s -o obj.o +// RUN: ld.lld -T lds.ld obj.o -o exe.elf 2>&1 | FileCheck %s --implicit-check-not=warning: --allow-empty +// RUN: llvm-objdump -D exe.elf | FileCheck --check-prefix=DISASM %s + +// DISASM-LABEL: : +// DISASM-NEXT: 1000: 0000002a + +// DISASM-LABEL: : +// DISASM-NEXT: 2000: 000004d2 + +// DISASM-LABEL: <.debug_something>: +// DISASM-NEXT: 0: 00001000 +// DISASM-NEXT: ... +// DISASM-NEXT: 104: 00002000 + +//--- lds.ld +SECTIONS { + data1 0x1000 : { *(data1) } + data2 0x2000 : { *(data2) } +} + +//--- asm.s + .text + .type _start,%function + .globl _start +_start: + bx lr + .size _start, .-_start + + .section data1, "aw", %progbits + .type rw,%object + .globl rw +rw: + .long 42 + .size rw, 4 + + .section data2, "aw", %progbits + .type rw2,%object + .globl rw2 +rw2: + .long 1234 + .size rw2, 4 + + .section .debug_something, "", %progbits + .long rw(sbrel) + .space 0x100 + .long rw2(sbrel) diff --git a/lldb/docs/use/aarch64-linux.md b/lldb/docs/use/aarch64-linux.md index 70432f57857a5..393838dc0bb4f 100644 --- a/lldb/docs/use/aarch64-linux.md +++ b/lldb/docs/use/aarch64-linux.md @@ -17,7 +17,7 @@ In LLDB you will be able to see the following new registers: * `z0-z31` vector registers, each one has size equal to the vector length. * `p0-p15` predicate registers, each one containing 1 bit per byte in the vector - length. Making each one vector length / 8 sized. + length. So each one is `vector length in bits / 8` bits. * `ffr` the first fault register, same size as a predicate register. * `vg`, the vector length in "granules". Each granule is 8 bytes. diff --git a/lldb/packages/Python/lldbsuite/test/lldbutil.py b/lldb/packages/Python/lldbsuite/test/lldbutil.py index 660a3c085a908..07b5f8cc7d900 100644 --- a/lldb/packages/Python/lldbsuite/test/lldbutil.py +++ b/lldb/packages/Python/lldbsuite/test/lldbutil.py @@ -1158,17 +1158,6 @@ def GetModuleName(i): return list(map(GetModuleName, list(range(thread.GetNumFrames())))) -def get_stack_frames(thread): - """ - Returns a sequence of stack frames for this thread. 
- """ - - def GetStackFrame(i): - return thread.GetFrameAtIndex(i) - - return list(map(GetStackFrame, list(range(thread.GetNumFrames())))) - - def print_stacktrace(thread, string_buffer=False): """Prints a simple stack trace of this thread.""" diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFIndex.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFIndex.cpp index c18edd10b9681..30c890d6d0138 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFIndex.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFIndex.cpp @@ -137,9 +137,19 @@ void DWARFIndex::GetTypesWithQuery( bool DWARFIndex::ProcessTypeDIEMatchQuery( TypeQuery &query, DWARFDIE die, llvm::function_ref callback) { - // Nothing to match from query - if (query.GetContextRef().size() <= 1) + // Check the language, but only if we have a language filter. + if (query.HasLanguage() && + !query.LanguageMatches(SymbolFileDWARF::GetLanguageFamily(*die.GetCU()))) + return true; // Keep iterating over index types, language mismatch. + + // Since mangled names are unique, we only need to check if the names are + // the same. + if (query.GetSearchByMangledName()) { + if (die.GetMangledName(/*substitute_name_allowed=*/false) != + query.GetTypeBasename().GetStringRef()) + return true; // Keep iterating over index types, mangled name mismatch. return callback(die); + } std::vector die_context; if (query.GetModuleSearch()) diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index 8ce0db4588a46..c900f330b481b 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -2726,39 +2726,8 @@ void SymbolFileDWARF::FindTypes(const TypeQuery &query, TypeResults &results) { TypeQuery query_full(query); bool have_index_match = false; m_index->GetTypesWithQuery(query_full, [&](DWARFDIE die) { - // Check the language, but only if we have a language filter. - if (query.HasLanguage()) { - if (!query.LanguageMatches(GetLanguageFamily(*die.GetCU()))) - return true; // Keep iterating over index types, language mismatch. - } - - // Since mangled names are unique, we only need to check if the names are - // the same. - if (query.GetSearchByMangledName()) { - if (die.GetMangledName(/*substitute_name_allowed=*/false) != - query.GetTypeBasename().GetStringRef()) - return true; // Keep iterating over index types, mangled name mismatch. - if (Type *matching_type = ResolveType(die, true, true)) { - results.InsertUnique(matching_type->shared_from_this()); - return !results.Done(query); // Keep iterating if we aren't done. - } - return true; // Keep iterating over index types, weren't able to resolve - // this type - } - - // Check the context matches - std::vector die_context; - if (query.GetModuleSearch()) - die_context = die.GetDeclContext(); - else - die_context = die.GetTypeLookupContext(); - assert(!die_context.empty()); - if (!query.ContextMatches(die_context)) - return true; // Keep iterating over index types, context mismatch. - - // Try to resolve the type. if (Type *matching_type = ResolveType(die, true, true)) { - if (matching_type->IsTemplateType()) { + if (!query.GetSearchByMangledName() && matching_type->IsTemplateType()) { // We have to watch out for case where we lookup a type by basename and // it matches a template with simple template names. 
Like looking up // "Foo" and if we have simple template names then we will match @@ -2790,7 +2759,7 @@ void SymbolFileDWARF::FindTypes(const TypeQuery &query, TypeResults &results) { // With -gsimple-template-names, a templated type's DW_AT_name will not // contain the template parameters. Try again stripping '<' and anything // after, filtering out entries with template parameters that don't match. - if (!have_index_match) { + if (!have_index_match && !query.GetSearchByMangledName()) { // Create a type matcher with a compiler context that is tuned for // -gsimple-template-names. We will use this for the index lookup and the // context matching, but will use the original "match" to insert matches @@ -2804,23 +2773,6 @@ void SymbolFileDWARF::FindTypes(const TypeQuery &query, TypeResults &results) { // Copy our match's context and update the basename we are looking for // so we can use this only to compare the context correctly. m_index->GetTypesWithQuery(query_simple, [&](DWARFDIE die) { - // Check the language, but only if we have a language filter. - if (query.HasLanguage()) { - if (!query.LanguageMatches(GetLanguageFamily(*die.GetCU()))) - return true; // Keep iterating over index types, language mismatch. - } - - // Check the context matches - std::vector die_context; - if (query.GetModuleSearch()) - die_context = die.GetDeclContext(); - else - die_context = die.GetTypeLookupContext(); - assert(!die_context.empty()); - if (!query_simple.ContextMatches(die_context)) - return true; // Keep iterating over index types, context mismatch. - - // Try to resolve the type. if (Type *matching_type = ResolveType(die, true, true)) { ConstString name = matching_type->GetQualifiedName(); // We have found a type that still might not match due to template diff --git a/lldb/test/API/symbol_ondemand/shared_library/TestSharedLibOnDemand.py b/lldb/test/API/symbol_ondemand/shared_library/TestSharedLibOnDemand.py index f1c23a58d1f48..dbb9576ed4d51 100644 --- a/lldb/test/API/symbol_ondemand/shared_library/TestSharedLibOnDemand.py +++ b/lldb/test/API/symbol_ondemand/shared_library/TestSharedLibOnDemand.py @@ -59,7 +59,7 @@ def test_source_line_breakpoint(self): lldbutil.check_breakpoint(self, bpno=1, expected_hit_count=1) thread = process.GetSelectedThread() - stack_frames = lldbutil.get_stack_frames(thread) + stack_frames = thread.frames self.assertGreater(len(stack_frames), 2) leaf_frame = stack_frames[0] @@ -97,7 +97,7 @@ def test_symbolic_breakpoint(self): lldbutil.check_breakpoint(self, bpno=1, expected_hit_count=1) thread = process.GetSelectedThread() - stack_frames = lldbutil.get_stack_frames(thread) + stack_frames = thread.frames self.assertGreater(len(stack_frames), 2) leaf_frame = stack_frames[0] diff --git a/llvm/Maintainers.md b/llvm/Maintainers.md index 71dd4c6b75b63..c3f98c9bbdff5 100644 --- a/llvm/Maintainers.md +++ b/llvm/Maintainers.md @@ -121,8 +121,8 @@ david.trevelyan@gmail.com (email), [davidtrevelyan](https://github.com/davidtrev #### Parts of code generator not covered by someone else -Evan Cheng \ -evan.cheng@apple.com (email) +Matt Arsenault \ +Matthew.Arsenault@amd.com, arsenm2@gmail.com (email), [arsenm](https://github.com/arsenm) (GitHub) #### SelectionDAG @@ -469,6 +469,7 @@ sabre@nondot.org (email), [lattner](https://github.com/lattner) (GitHub), clattn ### Inactive or former component maintainers Justin Bogner (mail@justinbogner.com, [bogner](https://github.com/bogner)) -- SelectionDAG \ +Evan Cheng (evan.cheng@apple.com) -- Parts of code generator not covered by someone else \ 
Hans Wennborg (hans@chromium.org, [zmodem](https://github.com/zmodem)) -- Release management \ ### Former maintainers of removed components diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 3670228a10835..fabe2dfd33178 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -21523,9 +21523,9 @@ This is an overloaded intrinsic. :: - declare <16 x i32> @llvm.vp.abs.v16i32 (<16 x i32> , <16 x i1> , i32 , i1 ) - declare @llvm.vp.abs.nxv4i32 ( , , i32 , i1 ) - declare <256 x i64> @llvm.vp.abs.v256i64 (<256 x i64> , <256 x i1> , i32 , i1 ) + declare <16 x i32> @llvm.vp.abs.v16i32 (<16 x i32> , i1 , <16 x i1> , i32 ) + declare @llvm.vp.abs.nxv4i32 ( , i1 , , i32 ) + declare <256 x i64> @llvm.vp.abs.v256i64 (<256 x i64> , i1 , <256 x i1> , i32 ) Overview: """"""""" @@ -21537,12 +21537,12 @@ Arguments: """""""""" The first argument and the result have the same vector of integer type. The -second argument is the vector mask and has the same number of elements as the -result vector type. The third argument is the explicit vector length of the -operation. The fourth argument must be a constant and is a flag to indicate -whether the result value of the '``llvm.vp.abs``' intrinsic is a -:ref:`poison value ` if the first argument is statically or -dynamically an ``INT_MIN`` value. +second argument must be a constant and is a flag to indicate whether the result +value of the '``llvm.vp.abs``' intrinsic is a :ref:`poison value ` +if the first argument is statically or dynamically an ``INT_MIN`` value. The +third argument is the vector mask and has the same number of elements as the +result vector type. The fourth argument is the explicit vector length of the +operation. Semantics: """""""""" @@ -21555,7 +21555,7 @@ Examples: .. code-block:: llvm - %r = call <4 x i32> @llvm.vp.abs.v4i32(<4 x i32> %a, <4 x i1> %mask, i32 %evl, i1 false) + %r = call <4 x i32> @llvm.vp.abs.v4i32(<4 x i32> %a, i1 false, <4 x i1> %mask, i32 %evl) ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r %t = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a, i1 false) @@ -25261,9 +25261,9 @@ This is an overloaded intrinsic. :: - declare <16 x i32> @llvm.vp.ctlz.v16i32 (<16 x i32> , <16 x i1> , i32 , i1 ) - declare @llvm.vp.ctlz.nxv4i32 ( , , i32 , i1 ) - declare <256 x i64> @llvm.vp.ctlz.v256i64 (<256 x i64> , <256 x i1> , i32 , i1 ) + declare <16 x i32> @llvm.vp.ctlz.v16i32 (<16 x i32> , i1 , <16 x i1> , i32 ) + declare @llvm.vp.ctlz.nxv4i32 ( , i1 , , i32 ) + declare <256 x i64> @llvm.vp.ctlz.v256i64 (<256 x i64> , i1 , <256 x i1> , i32 ) Overview: """"""""" @@ -25275,11 +25275,11 @@ Arguments: """""""""" The first argument and the result have the same vector of integer type. The -second argument is the vector mask and has the same number of elements as the -result vector type. The third argument is the explicit vector length of the -operation. The fourth argument is a constant flag that indicates whether the -intrinsic returns a valid result if the first argument is zero. If the first -argument is zero and the fourth argument is true, the result is poison. +second argument is a constant flag that indicates whether the intrinsic returns +a valid result if the first argument is zero. The third argument is the vector +mask and has the same number of elements as the result vector type. the fourth +argument is the explicit vector length of the operation. If the first argument +is zero and the second argument is true, the result is poison. Semantics: """""""""" @@ -25292,7 +25292,7 @@ Examples: .. 
code-block:: llvm - %r = call <4 x i32> @llvm.vp.ctlz.v4i32(<4 x i32> %a, <4 x i1> %mask, i32 %evl, i1 false) + %r = call <4 x i32> @llvm.vp.ctlz.v4i32(<4 x i32> %a, i1 false, <4 x i1> %mask, i32 %evl) ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r %t = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) @@ -25310,9 +25310,9 @@ This is an overloaded intrinsic. :: - declare <16 x i32> @llvm.vp.cttz.v16i32 (<16 x i32> <op>, <16 x i1> <mask>, i32 <vector_length>, i1 <is_zero_poison>) - declare <vscale x 4 x i32> @llvm.vp.cttz.nxv4i32 (<vscale x 4 x i32> <op>, <vscale x 4 x i1> <mask>, i32 <vector_length>, i1 <is_zero_poison>) - declare <256 x i64> @llvm.vp.cttz.v256i64 (<256 x i64> <op>, <256 x i1> <mask>, i32 <vector_length>, i1 <is_zero_poison>) + declare <16 x i32> @llvm.vp.cttz.v16i32 (<16 x i32> <op>, i1 <is_zero_poison>, <16 x i1> <mask>, i32 <vector_length>) + declare <vscale x 4 x i32> @llvm.vp.cttz.nxv4i32 (<vscale x 4 x i32> <op>, i1 <is_zero_poison>, <vscale x 4 x i1> <mask>, i32 <vector_length>) + declare <256 x i64> @llvm.vp.cttz.v256i64 (<256 x i64> <op>, i1 <is_zero_poison>, <256 x i1> <mask>, i32 <vector_length>) Overview: """"""""" @@ -25324,11 +25324,11 @@ Arguments: """""""""" The first argument and the result have the same vector of integer type. The -second argument is the vector mask and has the same number of elements as the -result vector type. The third argument is the explicit vector length of the -operation. The fourth argument is a constant flag that indicates whether the -intrinsic returns a valid result if the first argument is zero. If the first -argument is zero and the fourth argument is true, the result is poison. +second argument is a constant flag that indicates whether the intrinsic +returns a valid result if the first argument is zero. The third argument is +the vector mask and has the same number of elements as the result vector type. +The fourth argument is the explicit vector length of the operation. If the +first argument is zero and the second argument is true, the result is poison. Semantics: """""""""" @@ -25341,7 +25341,7 @@ Examples: .. code-block:: llvm - %r = call <4 x i32> @llvm.vp.cttz.v4i32(<4 x i32> %a, <4 x i1> %mask, i32 %evl, i1 false) + %r = call <4 x i32> @llvm.vp.cttz.v4i32(<4 x i32> %a, i1 false, <4 x i1> %mask, i32 %evl) ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r %t = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 false) diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index d3d68ff1c6ed2..d2fc40d8ae037 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -277,6 +277,20 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast); } + bool areInlineCompatible(const Function *Caller, + const Function *Callee) const { + const TargetMachine &TM = getTLI()->getTargetMachine(); + + const FeatureBitset &CallerBits = + TM.getSubtargetImpl(*Caller)->getFeatureBits(); + const FeatureBitset &CalleeBits = + TM.getSubtargetImpl(*Callee)->getFeatureBits(); + + // Inline a callee if its target-features are a subset of the caller's + // target-features. + return (CallerBits & CalleeBits) == CalleeBits; + } + bool hasBranchDivergence(const Function *F = nullptr) { return false; } bool isSourceOfDivergence(const Value *V) { return false; } diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h index 73a9586aaa644..dd4d90864a08c 100644 --- a/llvm/include/llvm/ProfileData/MemProf.h +++ b/llvm/include/llvm/ProfileData/MemProf.h @@ -1117,21 +1117,20 @@ template class CallStackRadixTreeBuilder { // Encode a call stack into RadixArray. Return the starting index within // RadixArray.
- LinearCallStackId - encodeCallStack(const llvm::SmallVector *CallStack, - const llvm::SmallVector *Prev, - std::optional> - MemProfFrameIndexes); + LinearCallStackId encodeCallStack( + const llvm::SmallVector *CallStack, + const llvm::SmallVector *Prev, + const llvm::DenseMap *MemProfFrameIndexes); public: CallStackRadixTreeBuilder() = default; // Build a radix tree array. - void build(llvm::MapVector> - &&MemProfCallStackData, - std::optional> - MemProfFrameIndexes, - llvm::DenseMap &FrameHistogram); + void + build(llvm::MapVector> + &&MemProfCallStackData, + const llvm::DenseMap *MemProfFrameIndexes, + llvm::DenseMap &FrameHistogram); ArrayRef getRadixArray() const { return RadixArray; } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index ba5d93f5eb55d..3e1071fe71a2a 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -4363,7 +4363,7 @@ static DenseMap writeMemoryProfileRadixTree( CallStackRadixTreeBuilder Builder; // We don't need a MemProfFrameIndexes map as we have already converted the // full stack id hash to a linear offset into the StackIds array. - Builder.build(std::move(CallStacks), /*MemProfFrameIndexes=*/std::nullopt, + Builder.build(std::move(CallStacks), /*MemProfFrameIndexes=*/nullptr, FrameHistogram); Stream.EmitRecord(bitc::FS_CONTEXT_RADIX_TREE_ARRAY, Builder.getRadixArray(), RadixAbbrev); diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index 3910046a1652b..b08a93ae9a6d5 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -3033,7 +3033,11 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { if (!MOP.getReg().isPhysical()) continue; - if (llvm::is_contained(TRI->subregs(MOP.getReg()), Reg)) + if (MOP.getReg() != Reg && + all_of(TRI->regunits(Reg), [&](const MCRegUnit RegUnit) { + return llvm::is_contained(TRI->regunits(MOP.getReg()), + RegUnit); + })) Bad = false; } } diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 521829675ae7c..57cce9f785263 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -22746,16 +22746,21 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT, /// Transform a vector binary operation into a scalar binary operation by moving /// the math/logic after an extract element of a vector. -static SDValue scalarizeExtractedBinop(SDNode *ExtElt, SelectionDAG &DAG, - const SDLoc &DL, bool LegalOperations) { +static SDValue scalarizeExtractedBinOp(SDNode *ExtElt, SelectionDAG &DAG, + const SDLoc &DL) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Vec = ExtElt->getOperand(0); SDValue Index = ExtElt->getOperand(1); auto *IndexC = dyn_cast(Index); - if (!IndexC || !TLI.isBinOp(Vec.getOpcode()) || !Vec.hasOneUse() || + unsigned Opc = Vec.getOpcode(); + if (!IndexC || (!TLI.isBinOp(Opc) && Opc != ISD::SETCC) || !Vec.hasOneUse() || Vec->getNumValues() != 1) return SDValue(); + EVT ResVT = ExtElt->getValueType(0); + if (Opc == ISD::SETCC && ResVT != Vec.getValueType().getVectorElementType()) + return SDValue(); + // Targets may want to avoid this to prevent an expensive register transfer. 
if (!TLI.shouldScalarizeBinop(Vec)) return SDValue(); @@ -22766,19 +22771,23 @@ static SDValue scalarizeExtractedBinop(SDNode *ExtElt, SelectionDAG &DAG, SDValue Op0 = Vec.getOperand(0); SDValue Op1 = Vec.getOperand(1); APInt SplatVal; - if (isAnyConstantBuildVector(Op0, true) || - ISD::isConstantSplatVector(Op0.getNode(), SplatVal) || - isAnyConstantBuildVector(Op1, true) || - ISD::isConstantSplatVector(Op1.getNode(), SplatVal)) { - // extractelt (binop X, C), IndexC --> binop (extractelt X, IndexC), C' - // extractelt (binop C, X), IndexC --> binop C', (extractelt X, IndexC) - EVT VT = ExtElt->getValueType(0); - SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Index); - SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op1, Index); - return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1); - } + if (!isAnyConstantBuildVector(Op0, true) && + !ISD::isConstantSplatVector(Op0.getNode(), SplatVal) && + !isAnyConstantBuildVector(Op1, true) && + !ISD::isConstantSplatVector(Op1.getNode(), SplatVal)) + return SDValue(); - return SDValue(); + // extractelt (op X, C), IndexC --> op (extractelt X, IndexC), C' + // extractelt (op C, X), IndexC --> op C', (extractelt X, IndexC) + EVT OpVT = Op0->getValueType(0).getVectorElementType(); + Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Op0, Index); + Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Op1, Index); + + if (Opc == ISD::SETCC) + return DAG.getSetCC(DL, ResVT, Op0, Op1, + cast(Vec->getOperand(2))->get()); + else + return DAG.getNode(Opc, DL, ResVT, Op0, Op1); } // Given a ISD::EXTRACT_VECTOR_ELT, which is a glorified bit sequence extract, @@ -23011,7 +23020,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { } } - if (SDValue BO = scalarizeExtractedBinop(N, DAG, DL, LegalOperations)) + if (SDValue BO = scalarizeExtractedBinOp(N, DAG, DL)) return BO; if (VecVT.isScalableVector()) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 1480bd98c685e..fbc96bade15f5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -2330,10 +2330,10 @@ SDValue SelectionDAGLegalize::expandLdexp(SDNode *Node) const { const APFloat::ExponentType MinExpVal = APFloat::semanticsMinExponent(FltSem); const int Precision = APFloat::semanticsPrecision(FltSem); - const SDValue MaxExp = DAG.getConstant(MaxExpVal, dl, ExpVT); - const SDValue MinExp = DAG.getConstant(MinExpVal, dl, ExpVT); + const SDValue MaxExp = DAG.getSignedConstant(MaxExpVal, dl, ExpVT); + const SDValue MinExp = DAG.getSignedConstant(MinExpVal, dl, ExpVT); - const SDValue DoubleMaxExp = DAG.getConstant(2 * MaxExpVal, dl, ExpVT); + const SDValue DoubleMaxExp = DAG.getSignedConstant(2 * MaxExpVal, dl, ExpVT); const APFloat One(FltSem, "1.0"); APFloat ScaleUpK = scalbn(One, MaxExpVal, APFloat::rmNearestTiesToEven); @@ -2375,7 +2375,7 @@ SDValue SelectionDAGLegalize::expandLdexp(SDNode *Node) const { SDValue IncN0 = DAG.getNode(ISD::ADD, dl, ExpVT, N, Increment0, NUW_NSW); SDValue ClampMinVal = - DAG.getConstant(3 * MinExpVal + 2 * Precision, dl, ExpVT); + DAG.getSignedConstant(3 * MinExpVal + 2 * Precision, dl, ExpVT); SDValue ClampN_Small = DAG.getNode(ISD::SMAX, dl, ExpVT, N, ClampMinVal); SDValue IncN1 = DAG.getNode(ISD::ADD, dl, ExpVT, ClampN_Small, Increment1, NSW); @@ -2385,8 +2385,8 @@ SDValue SelectionDAGLegalize::expandLdexp(SDNode *Node) const { SDValue ScaleDown1 = DAG.getNode(ISD::FMUL, dl, VT, ScaleDown0, ScaleDownVal); SDValue 
ScaleDownTwice = DAG.getSetCC( - dl, SetCCVT, N, DAG.getConstant(2 * MinExpVal + Precision, dl, ExpVT), - ISD::SETULT); + dl, SetCCVT, N, + DAG.getSignedConstant(2 * MinExpVal + Precision, dl, ExpVT), ISD::SETULT); SDValue SelectN_Small = DAG.getNode(ISD::SELECT, dl, ExpVT, ScaleDownTwice, IncN1, IncN0); diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index 725ff9256fd4a..d8ab18d213e3d 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -351,9 +351,14 @@ bool InstrProfWriter::addMemProfCallStack( bool InstrProfWriter::addMemProfData(memprof::IndexedMemProfData Incoming, function_ref Warn) { - // TODO: Once we remove support for MemProf format Version V1, assert that - // the three components (frames, call stacks, and records) are either all - // empty or populated. + // Return immediately if everything is empty. + if (Incoming.Frames.empty() && Incoming.CallStacks.empty() && + Incoming.Records.empty()) + return true; + + // Otherwise, every component must be non-empty. + assert(!Incoming.Frames.empty() && !Incoming.CallStacks.empty() && + !Incoming.Records.empty()); if (MemProfData.Frames.empty()) MemProfData.Frames = std::move(Incoming.Frames); @@ -636,7 +641,7 @@ writeMemProfCallStackArray( MemProfCallStackIndexes; memprof::CallStackRadixTreeBuilder Builder; - Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes, + Builder.build(std::move(MemProfCallStackData), &MemProfFrameIndexes, FrameHistogram); for (auto I : Builder.getRadixArray()) OS.write32(I); diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp index 20cc4eececc9b..1c240c3858cc7 100644 --- a/llvm/lib/ProfileData/MemProf.cpp +++ b/llvm/lib/ProfileData/MemProf.cpp @@ -335,8 +335,7 @@ template LinearCallStackId CallStackRadixTreeBuilder::encodeCallStack( const llvm::SmallVector *CallStack, const llvm::SmallVector *Prev, - std::optional> - MemProfFrameIndexes) { + const llvm::DenseMap *MemProfFrameIndexes) { // Compute the length of the common root prefix between Prev and CallStack. uint32_t CommonLen = 0; if (Prev) { @@ -381,8 +380,7 @@ template void CallStackRadixTreeBuilder::build( llvm::MapVector> &&MemProfCallStackData, - std::optional> - MemProfFrameIndexes, + const llvm::DenseMap *MemProfFrameIndexes, llvm::DenseMap &FrameHistogram) { // Take the vector portion of MemProfCallStackData. The vector is exactly // what we need to sort. Also, we no longer need its lookup capability. 
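The `CallStackRadixTreeBuilder` change above swaps a by-value `std::optional` map parameter for a plain `const` pointer: callers with no frame-index map now pass `nullptr` (as the bitcode writer does, since its ids are already linear offsets) instead of `std::nullopt`, and callers that do have a map no longer pay for copying a whole `DenseMap` into the optional. A minimal sketch of the same pattern, with `std::unordered_map` standing in for `llvm::DenseMap` and illustrative names throughout:

```cpp
#include <cstdint>
#include <iostream>
#include <unordered_map>

using FrameId = uint64_t;
using LinearFrameId = uint32_t;

// Nullable-map parameter, mirroring the new builder signature: callers whose
// ids are already linear pass nullptr; others pass the address of their map.
LinearFrameId toLinear(FrameId Id,
                       const std::unordered_map<FrameId, LinearFrameId> *Indexes) {
  if (Indexes) {
    auto It = Indexes->find(Id);
    return It != Indexes->end() ? It->second : 0; // 0 as a toy "not found"
  }
  // No map supplied: treat the id as already being a linear offset.
  return static_cast<LinearFrameId>(Id);
}

int main() {
  std::unordered_map<FrameId, LinearFrameId> Indexes = {{0xdeadbeefULL, 7}};
  std::cout << toLinear(0xdeadbeefULL, &Indexes) << '\n'; // 7, via the map
  std::cout << toLinear(42, nullptr) << '\n';             // 42, already linear
}
```

The pointer keeps the "absent" state representable while avoiding the construction cost that `std::optional<Map>` forces at every call boundary.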
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index cb0b9e965277a..d51b36f7e4994 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -1348,6 +1348,10 @@ class AArch64TargetLowering : public TargetLowering { unsigned getMinimumJumpTableEntries() const override; bool softPromoteHalfType() const override { return true; } + + bool shouldScalarizeBinop(SDValue VecOp) const override { + return VecOp.getOpcode() == ISD::SETCC; + } }; namespace AArch64 { diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index ec7bb71fd111f..7a1e401bca18c 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -266,16 +266,7 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, return false; } - const TargetMachine &TM = getTLI()->getTargetMachine(); - - const FeatureBitset &CallerBits = - TM.getSubtargetImpl(*Caller)->getFeatureBits(); - const FeatureBitset &CalleeBits = - TM.getSubtargetImpl(*Callee)->getFeatureBits(); - - // Inline a callee if its target-features are a subset of the callers - // target-features. - return (CallerBits & CalleeBits) == CalleeBits; + return BaseT::areInlineCompatible(Caller, Callee); } bool AArch64TTIImpl::areTypesABICompatible( diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 9c1bbafd337e9..ad31f29c04599 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1303,6 +1303,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .clampNumElements(0, v4s16, v8s16) .clampNumElements(0, v2s32, v4s32) .clampMaxNumElements(0, s64, 2) + .scalarizeIf(scalarOrEltWiderThan(0, 64), 0) .moreElementsToNextPow2(0) .lower(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 1c738e352b07b..7d78e9cd7eab6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -408,7 +408,8 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const { unsigned AS = cast(N)->getAddressSpace(); if (AS == AMDGPUAS::LOCAL_ADDRESS) { if (Subtarget->ldsRequiresM0Init()) - return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32)); + return glueCopyToM0( + N, CurDAG->getSignedTargetConstant(-1, SDLoc(N), MVT::i32)); } else if (AS == AMDGPUAS::REGION_ADDRESS) { MachineFunction &MF = CurDAG->getMachineFunction(); unsigned Value = MF.getInfo()->getGDSSize(); @@ -1724,7 +1725,7 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr, } VAddr = Addr; - Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32); + Offset = CurDAG->getSignedTargetConstant(OffsetVal, SDLoc(), MVT::i32); return true; } @@ -1832,7 +1833,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, } if (SAddr) { - Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32); + Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32); return true; } } @@ -1848,7 +1849,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32, CurDAG->getTargetConstant(0, SDLoc(), MVT::i32)); VOffset = SDValue(VMov, 0); - Offset = 
CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32); + Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32); return true; } @@ -1903,13 +1904,13 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr, SDValue AddOffset = SAddr.getOpcode() == ISD::TargetFrameIndex ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL) - : CurDAG->getTargetConstant(RemainderOffset, DL, MVT::i32); + : CurDAG->getSignedTargetConstant(RemainderOffset, DL, MVT::i32); SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32, SAddr, AddOffset), 0); } - Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i32); + Offset = CurDAG->getSignedTargetConstant(COffsetVal, DL, MVT::i32); return true; } @@ -2058,7 +2059,7 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, std::optional EncodedOffset = AMDGPU::getSMRDEncodedOffset( *Subtarget, ByteOffset, IsBuffer, HasSOffset); if (EncodedOffset && Offset && !Imm32Only) { - *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32); + *Offset = CurDAG->getSignedTargetConstant(*EncodedOffset, SL, MVT::i32); return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 3cc4bd92f6471..d77508227b076 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -2333,7 +2333,7 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, SDValue RHS = Op.getOperand(1); SDValue Zero = DAG.getConstant(0, DL, VT); - SDValue NegOne = DAG.getConstant(-1, DL, VT); + SDValue NegOne = DAG.getAllOnesConstant(DL, VT); if (VT == MVT::i32) { if (SDValue Res = LowerDIVREM24(Op, DAG, true)) @@ -3794,7 +3794,11 @@ static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, if (Width + Offset < 32) { uint32_t Shl = static_cast(Src0) << (32 - Offset - Width); IntTy Result = static_cast(Shl) >> (32 - Width); - return DAG.getConstant(Result, DL, MVT::i32); + if constexpr (std::is_signed_v) { + return DAG.getSignedConstant(Result, DL, MVT::i32); + } else { + return DAG.getConstant(Result, DL, MVT::i32); + } } return DAG.getConstant(Src0 >> Offset, DL, MVT::i32); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 087de1bed86f7..18a09c39a0638 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -1024,6 +1024,12 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { } break; } + case Intrinsic::amdgcn_wavefrontsize: { + if (ST->isWaveSizeKnown()) + return IC.replaceInstUsesWith( + II, ConstantInt::get(II.getType(), ST->getWavefrontSize())); + break; + } case Intrinsic::amdgcn_wqm_vote: { // wqm_vote is identity when the argument is constant. 
if (!isa(II.getArgOperand(0))) diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 1b88fdd3ab2e1..c2e952418f1be 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -919,7 +919,7 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT); HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT); } else if (CompareVT == MVT::i32) { - HWTrue = DAG.getConstant(-1, DL, CompareVT); + HWTrue = DAG.getAllOnesConstant(DL, CompareVT); HWFalse = DAG.getConstant(0, DL, CompareVT); } else { @@ -949,7 +949,7 @@ SDValue R600TargetLowering::lowerADDRSPACECAST(SDValue Op, unsigned DestAS = ASC->getDestAddressSpace(); if (isNullConstant(Op.getOperand(0)) && SrcAS == AMDGPUAS::FLAT_ADDRESS) - return DAG.getConstant(TM.getNullPointerValue(DestAS), SL, VT); + return DAG.getSignedConstant(TM.getNullPointerValue(DestAS), SL, VT); return Op; } @@ -1750,11 +1750,11 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, } return DAG.getNode(ISD::SELECT_CC, DL, N->getValueType(0), - SelectCC.getOperand(0), // LHS - SelectCC.getOperand(1), // RHS - DAG.getConstant(-1, DL, MVT::i32), // True - DAG.getConstant(0, DL, MVT::i32), // False - SelectCC.getOperand(4)); // CC + SelectCC.getOperand(0), // LHS + SelectCC.getOperand(1), // RHS + DAG.getAllOnesConstant(DL, MVT::i32), // True + DAG.getConstant(0, DL, MVT::i32), // False + SelectCC.getOperand(4)); // CC } // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 73ca59fe320d2..f3b5e6985e8e0 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4019,10 +4019,11 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op, Align StackAlign = TFL->getStackAlign(); Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value if (Alignment && *Alignment > StackAlign) { - Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1, - DAG.getConstant(-(uint64_t)Alignment->value() - << Subtarget->getWavefrontSizeLog2(), - dl, VT)); + Tmp1 = DAG.getNode( + ISD::AND, dl, VT, Tmp1, + DAG.getSignedConstant(-(uint64_t)Alignment->value() + << Subtarget->getWavefrontSizeLog2(), + dl, VT)); } Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain @@ -6771,10 +6772,10 @@ SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const { // TODO: This should be a generic narrowing legalization, and can easily be // for GlobalISel. 
- SDValue MinExp = DAG.getConstant(minIntN(16), DL, ExpVT); + SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT); SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp); - SDValue MaxExp = DAG.getConstant(maxIntN(16), DL, ExpVT); + SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT); SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp); SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp); @@ -7542,11 +7543,11 @@ SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op, SDValue Vec0 = SVN->getOperand(VecIdx0); SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0, - DAG.getConstant(EltIdx0, SL, MVT::i32)); + DAG.getSignedConstant(EltIdx0, SL, MVT::i32)); SDValue Vec1 = SVN->getOperand(VecIdx1); SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1, - DAG.getConstant(EltIdx1, SL, MVT::i32)); + DAG.getSignedConstant(EltIdx1, SL, MVT::i32)); Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1})); } } @@ -9618,7 +9619,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait if (ST.hasSplitBarriers()) { SDValue K = - DAG.getTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32); + DAG.getSignedTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32); SDValue BarSignal = SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL, MVT::Other, K, Op.getOperand(0)), @@ -11173,8 +11174,9 @@ SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const { SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags); SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS); - SDValue SqrtSNextDownInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt, - DAG.getConstant(-1, DL, MVT::i32)); + SDValue SqrtSNextDownInt = + DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt, + DAG.getAllOnesConstant(DL, MVT::i32)); SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt); SDValue NegSqrtSNextDown = @@ -11296,7 +11298,7 @@ SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const { SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2); - SDValue ScaleDownFactor = DAG.getConstant(-128, DL, MVT::i32); + SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32); SDValue ScaleDown = DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt); SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags); @@ -14689,7 +14691,7 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N, (CRHS->isZero() && (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE))) return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0), - DAG.getConstant(-1, SL, MVT::i1)); + DAG.getAllOnesConstant(SL, MVT::i1)); if ((CRHS->isAllOnes() && (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) || (CRHS->isZero() && @@ -14715,7 +14717,7 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N, if ((CF == CRHSVal && CC == ISD::SETEQ) || (CT == CRHSVal && CC == ISD::SETNE)) return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0), - DAG.getConstant(-1, SL, MVT::i1)); + DAG.getAllOnesConstant(SL, MVT::i1)); if ((CF == CRHSVal && CC == ISD::SETNE) || (CT == CRHSVal && CC == ISD::SETEQ)) return LHS.getOperand(0); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 84cb1e48772ca..7f77270a93183 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -816,27 +816,29 @@ def as_i8imm : SDNodeXForm; def as_i8timm : SDNodeXFormgetTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i16); + return CurDAG->getSignedTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i16); }]>; def as_i16imm : SDNodeXFormgetTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i16); + return CurDAG->getSignedTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i16); }]>; def as_i16timm : SDNodeXFormgetTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i16); + // Explicit cast, as this is used with both signed and unsigned immediates. + return CurDAG->getSignedTargetConstant(int16_t(N->getSExtValue()), SDLoc(N), + MVT::i16); }]>; def as_i32imm: SDNodeXFormgetTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32); + return CurDAG->getSignedTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32); }]>; def as_i32timm: SDNodeXFormgetTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32); + return CurDAG->getSignedTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32); }]>; def as_i64imm: SDNodeXFormgetTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i64); + return CurDAG->getSignedTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i64); }]>; def cond_as_i32imm: SDNodeXFormgetValueAPF(); int Log2 = APF.getExactLog2Abs(); assert(Log2 != INT_MIN); - return CurDAG->getTargetConstant(Log2, SDLoc(N), MVT::i32); + return CurDAG->getSignedTargetConstant(Log2, SDLoc(N), MVT::i32); }]>; // Check if a floating point value is a power of 2 floating-point diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 6eae756b25fb5..c0021f69f18e7 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2093,7 +2093,7 @@ bool RISCVTargetLowering::shouldScalarizeBinop(SDValue VecOp) const { // Assume target opcodes can't be scalarized. // TODO - do we have any exceptions? - if (Opc >= ISD::BUILTIN_OP_END) + if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc)) return false; // If the vector op is not supported, try to convert to scalar. 
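The `shouldScalarizeBinop` tightening here (and in the WebAssembly and X86 hunks below) pairs with the `scalarizeExtractedBinOp` change earlier in the diff, which now also folds `extractelt (setcc X, SplatC), Idx` into a scalar compare of the extracted lane. A toy check of the lane-wise equivalence the combine relies on, in plain C++ with hypothetical values:

```cpp
#include <array>
#include <cassert>
#include <cstdint>

int main() {
  // extractelt (setcc X, SplatC), Idx  ==  setcc (extractelt X, Idx), C
  std::array<int32_t, 4> X = {5, -3, 7, 0}; // hypothetical lane values
  const int32_t C = 4;                      // splatted constant operand
  const unsigned Idx = 2;                   // constant extraction index

  std::array<bool, 4> VecCmp{}; // the vector setcc result, lane by lane
  for (unsigned I = 0; I < 4; ++I)
    VecCmp[I] = X[I] > C;

  bool Scalar = X[Idx] > C; // the scalarized form: compare one lane only
  assert(VecCmp[Idx] == Scalar);
  (void)Scalar;
}
```

Because the index is a constant and one operand is a constant splat or build vector, extracting first and comparing second is always lane-equivalent, which is what lets the combine drop the vector compare entirely.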
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td index b01af468d9ea2..2924083ece344 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td @@ -285,7 +285,8 @@ def : Pat<(riscv_fclass FPR64:$rs1), (FCLASS_D $rs1)>; def : PatFprFpr; def : PatFprFpr; -def : Pat<(fcopysign FPR64:$rs1, (fneg FPR64:$rs2)), (FSGNJN_D $rs1, $rs2)>; +def : Pat<(fcopysign FPR64:$rs1, (fneg FPR64:$rs2)), + (FSGNJN_D FPR64:$rs1, FPR64:$rs2)>; def : Pat<(fcopysign FPR64:$rs1, FPR32:$rs2), (FSGNJ_D $rs1, (FCVT_D_S $rs2, FRM_RNE))>; def : Pat<(fcopysign FPR32:$rs1, FPR64:$rs2), (FSGNJ_S $rs1, (FCVT_S_D $rs2, @@ -323,7 +324,7 @@ def : Pat<(riscv_fclass FPR64INX:$rs1), (FCLASS_D_INX $rs1)>; def : PatFprFpr; def : PatFprFpr; def : Pat<(fcopysign FPR64INX:$rs1, (fneg FPR64INX:$rs2)), - (FSGNJN_D_INX $rs1, $rs2)>; + (FSGNJN_D_INX FPR64INX:$rs1, FPR64INX:$rs2)>; def : Pat<(fcopysign FPR64INX:$rs1, FPR32INX:$rs2), (FSGNJ_D_INX $rs1, (f64 (FCVT_D_S_INX $rs2, FRM_RNE)))>; def : Pat<(fcopysign FPR32INX:$rs1, FPR64INX:$rs2), @@ -361,7 +362,7 @@ def : Pat<(riscv_fclass FPR64IN32X:$rs1), (FCLASS_D_IN32X $rs1)>; def : PatFprFpr; def : PatFprFpr; def : Pat<(fcopysign FPR64IN32X:$rs1, (fneg FPR64IN32X:$rs2)), - (FSGNJN_D_IN32X $rs1, $rs2)>; + (FSGNJN_D_IN32X FPR64IN32X:$rs1, FPR64IN32X:$rs2)>; def : Pat<(fcopysign FPR64IN32X:$rs1, FPR32INX:$rs2), (FSGNJ_D_IN32X $rs1, (FCVT_D_S_IN32X $rs2, FRM_RNE))>; def : Pat<(fcopysign FPR32INX:$rs1, FPR64IN32X:$rs2), diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td index 2c27e3950f07f..6c41c53bb301f 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td @@ -570,7 +570,8 @@ defm : PatFprFpr_m; } let Predicates = [HasStdExtF] in { -def : Pat<(fcopysign FPR32:$rs1, (fneg FPR32:$rs2)), (FSGNJN_S $rs1, $rs2)>; +def : Pat<(fcopysign FPR32:$rs1, (fneg FPR32:$rs2)), + (FSGNJN_S FPR32:$rs1, FPR32:$rs2)>; // fmadd: rs1 * rs2 + rs3 def : Pat<(any_fma FPR32:$rs1, FPR32:$rs2, FPR32:$rs3), @@ -594,7 +595,8 @@ def : Pat<(fneg (any_fma_nsz FPR32:$rs1, FPR32:$rs2, FPR32:$rs3)), } // Predicates = [HasStdExtF] let Predicates = [HasStdExtZfinx] in { -def : Pat<(fcopysign FPR32INX:$rs1, (fneg FPR32INX:$rs2)), (FSGNJN_S_INX $rs1, $rs2)>; +def : Pat<(fcopysign FPR32INX:$rs1, (fneg FPR32INX:$rs2)), + (FSGNJN_S_INX FPR32INX:$rs1, FPR32INX:$rs2)>; // fmadd: rs1 * rs2 + rs3 def : Pat<(any_fma FPR32INX:$rs1, FPR32INX:$rs2, FPR32INX:$rs3), diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td index e2e99cc3f2b72..625011c3b9f7c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td @@ -291,7 +291,8 @@ def : Pat<(riscv_fclass (f16 FPR16:$rs1)), (FCLASS_H $rs1)>; def : PatFprFpr; def : PatFprFpr; -def : Pat<(f16 (fcopysign FPR16:$rs1, (f16 (fneg FPR16:$rs2)))), (FSGNJN_H $rs1, $rs2)>; +def : Pat<(f16 (fcopysign FPR16:$rs1, (f16 (fneg FPR16:$rs2)))), + (FSGNJN_H FPR16:$rs1, FPR16:$rs2)>; def : Pat<(f16 (fcopysign FPR16:$rs1, FPR32:$rs2)), (FSGNJ_H $rs1, (f16 (FCVT_H_S $rs2, FRM_DYN)))>; @@ -334,7 +335,8 @@ def : Pat<(riscv_fclass FPR16INX:$rs1), (FCLASS_H_INX $rs1)>; def : PatFprFpr; def : PatFprFpr; -def : Pat<(fcopysign FPR16INX:$rs1, (fneg FPR16INX:$rs2)), (FSGNJN_H_INX $rs1, $rs2)>; +def : Pat<(fcopysign FPR16INX:$rs1, (fneg FPR16INX:$rs2)), + (FSGNJN_H_INX FPR16INX:$rs1, FPR16INX:$rs2)>; def : Pat<(fcopysign FPR16INX:$rs1, FPR32INX:$rs2), 
(FSGNJ_H_INX $rs1, (FCVT_H_S_INX $rs2, FRM_DYN))>; diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 45576d515112d..017264fbd46e0 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1026,7 +1026,9 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, if (ST->hasVInstructions() && LT.second.isVector()) { // vrsub.vi v10, v8, 0 // vmax.vv v8, v8, v10 - return LT.first * 2; + return LT.first * + getRISCVInstructionCost({RISCV::VRSUB_VI, RISCV::VMAX_VV}, + LT.second, CostKind); } break; } @@ -1157,6 +1159,16 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return getCmpSelInstrCost(Instruction::Select, ICA.getReturnType(), ICA.getArgTypes()[0], CmpInst::BAD_ICMP_PREDICATE, CostKind); + case Intrinsic::experimental_vp_splat: { + auto LT = getTypeLegalizationCost(RetTy); + // TODO: Lower i1 experimental_vp_splat + if (!ST->hasVInstructions() || LT.second.getScalarType() == MVT::i1) + return InstructionCost::getInvalid(); + return LT.first * getRISCVInstructionCost(LT.second.isFloatingPoint() + ? RISCV::VFMV_V_F + : RISCV::VMV_V_X, + LT.second, CostKind); + } case Intrinsic::vp_reduce_add: case Intrinsic::vp_reduce_fadd: case Intrinsic::vp_reduce_mul: @@ -2330,20 +2342,6 @@ bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) { return true; } -bool RISCVTTIImpl::areInlineCompatible(const Function *Caller, - const Function *Callee) const { - const TargetMachine &TM = getTLI()->getTargetMachine(); - - const FeatureBitset &CallerBits = - TM.getSubtargetImpl(*Caller)->getFeatureBits(); - const FeatureBitset &CalleeBits = - TM.getSubtargetImpl(*Callee)->getFeatureBits(); - - // Inline a callee if its target-features are a subset of the callers - // target-features. - return (CallerBits & CalleeBits) == CalleeBits; -} - /// See if \p I should be considered for address type promotion. We check if \p /// I is a sext with right type and used in memory accesses. If it used in a /// "complex" getelementptr, we allow it to be promoted without finding other diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 498f48353dc0c..6fd36e90a02dd 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -60,9 +60,6 @@ class RISCVTTIImpl : public BasicTTIImplBase { : BaseT(TM, F.getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} - bool areInlineCompatible(const Function *Caller, - const Function *Callee) const; - /// Return the cost of materializing an immediate for a value operand of /// a store instruction. InstructionCost getStoreImmCost(Type *VecTy, TTI::OperandValueInfo OpInfo, diff --git a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp index 90d7bd934af40..403d238aa5b52 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp @@ -671,7 +671,7 @@ void SystemZDAGToDAGISel::getAddressOperands(const SystemZAddressingMode &AM, } // Lower the displacement to a TargetConstant. 
- Disp = CurDAG->getTargetConstant(AM.Disp, SDLoc(Base), VT); + Disp = CurDAG->getSignedTargetConstant(AM.Disp, SDLoc(Base), VT); } void SystemZDAGToDAGISel::getAddressOperands(const SystemZAddressingMode &AM, @@ -2024,8 +2024,9 @@ SDValue SystemZDAGToDAGISel::expandSelectBoolean(SDNode *Node) { CurDAG->getConstant(IPM.XORValue, DL, MVT::i32)); if (IPM.AddValue) - Result = CurDAG->getNode(ISD::ADD, DL, MVT::i32, Result, - CurDAG->getConstant(IPM.AddValue, DL, MVT::i32)); + Result = + CurDAG->getNode(ISD::ADD, DL, MVT::i32, Result, + CurDAG->getSignedConstant(IPM.AddValue, DL, MVT::i32)); EVT VT = Node->getValueType(0); if (VT == MVT::i32 && IPM.Bit == 31) { diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 78d91299a357d..8f505b7e198cf 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1444,15 +1444,15 @@ void SystemZTargetLowering::LowerAsmOperandForConstraint( case 'K': // Signed 16-bit constant if (auto *C = dyn_cast(Op)) if (isInt<16>(C->getSExtValue())) - Ops.push_back(DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), - Op.getValueType())); + Ops.push_back(DAG.getSignedTargetConstant( + C->getSExtValue(), SDLoc(Op), Op.getValueType())); return; case 'L': // Signed 20-bit displacement (on all targets we support) if (auto *C = dyn_cast(Op)) if (isInt<20>(C->getSExtValue())) - Ops.push_back(DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), - Op.getValueType())); + Ops.push_back(DAG.getSignedTargetConstant( + C->getSExtValue(), SDLoc(Op), Op.getValueType())); return; case 'M': // 0x7fffffff @@ -2578,7 +2578,7 @@ static void adjustSubwordCmp(SelectionDAG &DAG, const SDLoc &DL, // Make sure that the second operand is an i32 with the right value. if (C.Op1.getValueType() != MVT::i32 || Value != ConstOp1->getZExtValue()) - C.Op1 = DAG.getConstant(Value, DL, MVT::i32); + C.Op1 = DAG.getConstant((uint32_t)Value, DL, MVT::i32); } // Return true if Op is either an unextended load, or a load suitable @@ -3410,7 +3410,7 @@ SDValue SystemZTargetLowering::lowerVectorSETCC(SelectionDAG &DAG, } if (Invert) { SDValue Mask = - DAG.getSplatBuildVector(VT, DL, DAG.getConstant(-1, DL, MVT::i64)); + DAG.getSplatBuildVector(VT, DL, DAG.getAllOnesConstant(DL, MVT::i64)); Cmp = DAG.getNode(ISD::XOR, DL, VT, Cmp, Mask); } if (Chain && Chain.getNode() != Cmp.getNode()) { @@ -3571,7 +3571,7 @@ SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node, // addition for it. if (Offset != 0) Result = DAG.getNode(ISD::ADD, DL, PtrVT, Result, - DAG.getConstant(Offset, DL, PtrVT)); + DAG.getSignedConstant(Offset, DL, PtrVT)); return Result; } @@ -3834,7 +3834,7 @@ SDValue SystemZTargetLowering::lowerRETURNADDR(SDValue Op, const auto *TFL = Subtarget.getFrameLowering(); int Offset = TFL->getReturnAddressOffset(MF); SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, FrameAddr, - DAG.getConstant(Offset, DL, PtrVT)); + DAG.getSignedConstant(Offset, DL, PtrVT)); return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo()); } @@ -4584,7 +4584,7 @@ static void getCSAddressAndShifts(SDValue Addr, SelectionDAG &DAG, SDLoc DL, // Get the address of the containing word. AlignedAddr = DAG.getNode(ISD::AND, DL, PtrVT, Addr, - DAG.getConstant(-4, DL, PtrVT)); + DAG.getSignedConstant(-4, DL, PtrVT)); // Get the number of bits that the word must be rotated left in order // to bring the field to the top bits of a GR32. 
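The recurring `getConstant` to `getSignedConstant` (and `getTargetConstant` to `getSignedTargetConstant`) switches in the AMDGPU and SystemZ hunks above all guard against the same trap: a negative immediate, such as the `-4` containing-word mask in `getCSAddressAndShifts`, must be sign-extended rather than zero-extended when it is widened to the node's integer type. A self-contained illustration without SelectionDAG:

```cpp
#include <cstdint>
#include <iostream>

int main() {
  int16_t Imm = -4; // e.g. an address-alignment mask immediate

  // Zero-extending the 16-bit pattern produces the wrong 64-bit value:
  uint64_t ZExt = static_cast<uint16_t>(Imm);
  // Sign-extending preserves the intended value:
  int64_t SExt = Imm;

  std::cout << std::hex << ZExt << '\n'; // fffc, no longer -4
  std::cout << std::dec << SExt << '\n'; // -4
}
```

The signed variants make the sign-extension explicit at the call site instead of relying on whatever implicit conversion the immediate's C++ type happens to trigger.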
@@ -4623,7 +4623,8 @@ SDValue SystemZTargetLowering::lowerATOMIC_LOAD_OP(SDValue Op, if (Opcode == SystemZISD::ATOMIC_LOADW_SUB) if (auto *Const = dyn_cast(Src2)) { Opcode = SystemZISD::ATOMIC_LOADW_ADD; - Src2 = DAG.getConstant(-Const->getSExtValue(), DL, Src2.getValueType()); + Src2 = DAG.getSignedConstant(-Const->getSExtValue(), DL, + Src2.getValueType()); } SDValue AlignedAddr, BitShift, NegBitShift; diff --git a/llvm/lib/Target/SystemZ/SystemZOperands.td b/llvm/lib/Target/SystemZ/SystemZOperands.td index 0221e2c53f2f4..64345ca3a1394 100644 --- a/llvm/lib/Target/SystemZ/SystemZOperands.td +++ b/llvm/lib/Target/SystemZ/SystemZOperands.td @@ -220,8 +220,8 @@ def NEGLF32 : SDNodeXFormgetTargetConstant(int8_t(N->getZExtValue()), SDLoc(N), - MVT::i64); + return CurDAG->getSignedTargetConstant(int8_t(N->getZExtValue()), SDLoc(N), + MVT::i64); }]>; // Truncate an immediate to a 8-bit unsigned quantity. @@ -244,14 +244,14 @@ def UIMM12 : SDNodeXFormgetTargetConstant(int16_t(N->getZExtValue()), SDLoc(N), - MVT::i64); + return CurDAG->getSignedTargetConstant(int16_t(N->getZExtValue()), SDLoc(N), + MVT::i64); }]>; // Negate and then truncate an immediate to a 16-bit signed quantity. def NEGSIMM16 : SDNodeXFormgetTargetConstant(int16_t(-N->getZExtValue()), SDLoc(N), - MVT::i64); + return CurDAG->getSignedTargetConstant(int16_t(-N->getZExtValue()), SDLoc(N), + MVT::i64); }]>; // Truncate an immediate to a 16-bit unsigned quantity. @@ -268,8 +268,8 @@ def SIMM32 : SDNodeXFormgetTargetConstant(int32_t(-N->getZExtValue()), SDLoc(N), - MVT::i64); + return CurDAG->getSignedTargetConstant(int32_t(-N->getZExtValue()), SDLoc(N), + MVT::i64); }]>; // Truncate an immediate to a 32-bit unsigned quantity. diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp index 4eb58e27f7ad7..c182c9890509f 100644 --- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp @@ -53,7 +53,7 @@ static SDValue emitMemMemReg(SelectionDAG &DAG, const SDLoc &DL, unsigned Op, int64_t Adj = getMemMemLenAdj(Op); SDValue LenAdj = DAG.getNode(ISD::ADD, DL, MVT::i64, DAG.getZExtOrTrunc(Size, DL, MVT::i64), - DAG.getConstant(0 - Adj, DL, MVT::i64)); + DAG.getSignedConstant(0 - Adj, DL, MVT::i64)); return createMemMemNode(DAG, DL, Op, Chain, Dst, Src, LenAdj, Byte); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 2d00889407ff4..a52af6832d583 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -429,7 +429,7 @@ bool WebAssemblyTargetLowering::shouldScalarizeBinop(SDValue VecOp) const { // Assume target opcodes can't be scalarized. // TODO - do we have any exceptions? - if (Opc >= ISD::BUILTIN_OP_END) + if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc)) return false; // If the vector op is not supported, try to convert to scalar. 
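Several per-target `areInlineCompatible` copies (AArch64 above, RISC-V, and the WebAssembly one removed just below) collapse into the single `BasicTTIImplBase` implementation added near the top of this diff. The rule is a plain bitset subset test; a toy version with `std::bitset` standing in for `llvm::FeatureBitset`:

```cpp
#include <bitset>
#include <cassert>

int main() {
  // One bit per subtarget feature, values are illustrative only.
  std::bitset<8> Caller("01101110"); // features the caller was compiled with
  std::bitset<8> Callee("00101100"); // features the callee requires

  // The hoisted rule: inlining is safe when the callee's features are a
  // subset of the caller's.
  bool Compatible = (Caller & Callee) == Callee;
  assert(Compatible);

  // A callee using a feature the caller lacks must not be inlined.
  std::bitset<8> Callee2("10000000");
  assert(((Caller & Callee2) == Callee2) == false);
}
```

Hoisting the check lets targets that need extra conditions, as AArch64 does for SME attributes, keep their own prelude and then defer to `BaseT::areInlineCompatible` for the shared subset test.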
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp index f96e3232b93f4..3d678e5384166 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -104,24 +104,6 @@ TTI::ReductionShuffle WebAssemblyTTIImpl::getPreferredExpandedReductionShuffle( return TTI::ReductionShuffle::SplitHalf; } -bool WebAssemblyTTIImpl::areInlineCompatible(const Function *Caller, - const Function *Callee) const { - // Allow inlining only when the Callee has a subset of the Caller's - // features. In principle, we should be able to inline regardless of any - // features because WebAssembly supports features at module granularity, not - // function granularity, but without this restriction it would be possible for - // a module to "forget" about features if all the functions that used them - // were inlined. - const TargetMachine &TM = getTLI()->getTargetMachine(); - - const FeatureBitset &CallerBits = - TM.getSubtargetImpl(*Caller)->getFeatureBits(); - const FeatureBitset &CalleeBits = - TM.getSubtargetImpl(*Callee)->getFeatureBits(); - - return (CallerBits & CalleeBits) == CalleeBits; -} - void WebAssemblyTTIImpl::getUnrollingPreferences( Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const { diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h index 2ce6cbf3ba026..9691120b2e531 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h @@ -72,9 +72,6 @@ class WebAssemblyTTIImpl final : public BasicTTIImplBase { TTI::ReductionShuffle getPreferredExpandedReductionShuffle(const IntrinsicInst *II) const; - bool areInlineCompatible(const Function *Caller, - const Function *Callee) const; - bool supportsTailCalls() const; bool isProfitableToSinkOperands(Instruction *I, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 9048d1d83f187..d3b569eccc17b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -3300,7 +3300,7 @@ bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const { // Assume target opcodes can't be scalarized. // TODO - do we have any exceptions? - if (Opc >= ISD::BUILTIN_OP_END) + if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc)) return false; // If the vector op is not supported, try to convert to scalar. diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index e2eae7fb8327c..b4033fc2a418a 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1185,14 +1185,27 @@ static Value *extractIntPart(const IntPart &P, IRBuilderBase &Builder) { /// (icmp eq X0, Y0) & (icmp eq X1, Y1) -> icmp eq X01, Y01 /// (icmp ne X0, Y0) | (icmp ne X1, Y1) -> icmp ne X01, Y01 /// where X0, X1 and Y0, Y1 are adjacent parts extracted from an integer. -Value *InstCombinerImpl::foldEqOfParts(ICmpInst *Cmp0, ICmpInst *Cmp1, - bool IsAnd) { +Value *InstCombinerImpl::foldEqOfParts(Value *Cmp0, Value *Cmp1, bool IsAnd) { if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse()) return nullptr; CmpInst::Predicate Pred = IsAnd ? 
CmpInst::ICMP_EQ : CmpInst::ICMP_NE; - auto GetMatchPart = [&](ICmpInst *Cmp, + auto GetMatchPart = [&](Value *CmpV, unsigned OpNo) -> std::optional { + assert(CmpV->getType()->isIntOrIntVectorTy(1) && "Must be bool"); + + Value *X, *Y; + // icmp ne (and x, 1), (and y, 1) <=> trunc (xor x, y) to i1 + // icmp eq (and x, 1), (and y, 1) <=> not (trunc (xor x, y) to i1) + if (Pred == CmpInst::ICMP_NE + ? match(CmpV, m_Trunc(m_Xor(m_Value(X), m_Value(Y)))) + : match(CmpV, m_Not(m_Trunc(m_Xor(m_Value(X), m_Value(Y)))))) + return {{OpNo == 0 ? X : Y, 0, 1}}; + + auto *Cmp = dyn_cast(CmpV); + if (!Cmp) + return std::nullopt; + if (Pred == Cmp->getPredicate()) return matchIntPart(Cmp->getOperand(OpNo)); @@ -1465,11 +1478,15 @@ Value *InstCombinerImpl::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS, // FCmp canonicalization ensures that (fcmp ord/uno X, X) and // (fcmp ord/uno X, C) will be transformed to (fcmp X, +0.0). - if (match(LHS1, m_PosZeroFP()) && match(RHS1, m_PosZeroFP())) + if (match(LHS1, m_PosZeroFP()) && match(RHS1, m_PosZeroFP())) { // Ignore the constants because they are obviously not NANs: // (fcmp ord x, 0.0) & (fcmp ord y, 0.0) -> (fcmp ord x, y) // (fcmp uno x, 0.0) | (fcmp uno y, 0.0) -> (fcmp uno x, y) + IRBuilder<>::FastMathFlagGuard FMFG(Builder); + Builder.setFastMathFlags(LHS->getFastMathFlags() & + RHS->getFastMathFlags()); return Builder.CreateFCmp(PredL, LHS0, RHS0); + } } if (IsAnd && stripSignOnlyFPOps(LHS0) == stripSignOnlyFPOps(RHS0)) { @@ -2728,47 +2745,31 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { foldBooleanAndOr(Op0, Op1, I, /*IsAnd=*/true, /*IsLogical=*/false)) return replaceInstUsesWith(I, Res); - { - ICmpInst *LHS = dyn_cast(Op0); - ICmpInst *RHS = dyn_cast(Op1); - - // TODO: Base this on foldBooleanAndOr instead? - // TODO: Make this recursive; it's a little tricky because an arbitrary - // number of 'and' instructions might have to be created. - if (LHS && match(Op1, m_OneUse(m_LogicalAnd(m_Value(X), m_Value(Y))))) { - bool IsLogical = isa(Op1); - // LHS & (X && Y) --> (LHS && X) && Y - if (auto *Cmp = dyn_cast(X)) - if (Value *Res = - foldAndOrOfICmps(LHS, Cmp, I, /* IsAnd */ true, IsLogical)) - return replaceInstUsesWith(I, IsLogical - ? Builder.CreateLogicalAnd(Res, Y) - : Builder.CreateAnd(Res, Y)); - // LHS & (X && Y) --> X && (LHS & Y) - if (auto *Cmp = dyn_cast(Y)) - if (Value *Res = foldAndOrOfICmps(LHS, Cmp, I, /* IsAnd */ true, - /* IsLogical */ false)) - return replaceInstUsesWith(I, IsLogical - ? Builder.CreateLogicalAnd(X, Res) - : Builder.CreateAnd(X, Res)); - } - if (RHS && match(Op0, m_OneUse(m_LogicalAnd(m_Value(X), m_Value(Y))))) { - bool IsLogical = isa(Op0); - // (X && Y) & RHS --> (X && RHS) && Y - if (auto *Cmp = dyn_cast(X)) - if (Value *Res = - foldAndOrOfICmps(Cmp, RHS, I, /* IsAnd */ true, IsLogical)) - return replaceInstUsesWith(I, IsLogical - ? Builder.CreateLogicalAnd(Res, Y) - : Builder.CreateAnd(Res, Y)); - // (X && Y) & RHS --> X && (Y & RHS) - if (auto *Cmp = dyn_cast(Y)) - if (Value *Res = foldAndOrOfICmps(Cmp, RHS, I, /* IsAnd */ true, - /* IsLogical */ false)) - return replaceInstUsesWith(I, IsLogical - ? Builder.CreateLogicalAnd(X, Res) - : Builder.CreateAnd(X, Res)); - } + // TODO: Make this recursive; it's a little tricky because an arbitrary + // number of 'and' instructions might have to be created. 
+ if (match(Op1, m_OneUse(m_LogicalAnd(m_Value(X), m_Value(Y))))) { + bool IsLogical = isa(Op1); + // Op0 & (X && Y) --> (Op0 && X) && Y + if (Value *Res = foldBooleanAndOr(Op0, X, I, /* IsAnd */ true, IsLogical)) + return replaceInstUsesWith(I, IsLogical ? Builder.CreateLogicalAnd(Res, Y) + : Builder.CreateAnd(Res, Y)); + // Op0 & (X && Y) --> X && (Op0 & Y) + if (Value *Res = foldBooleanAndOr(Op0, Y, I, /* IsAnd */ true, + /* IsLogical */ false)) + return replaceInstUsesWith(I, IsLogical ? Builder.CreateLogicalAnd(X, Res) + : Builder.CreateAnd(X, Res)); + } + if (match(Op0, m_OneUse(m_LogicalAnd(m_Value(X), m_Value(Y))))) { + bool IsLogical = isa(Op0); + // (X && Y) & Op1 --> (X && Op1) && Y + if (Value *Res = foldBooleanAndOr(X, Op1, I, /* IsAnd */ true, IsLogical)) + return replaceInstUsesWith(I, IsLogical ? Builder.CreateLogicalAnd(Res, Y) + : Builder.CreateAnd(Res, Y)); + // (X && Y) & Op1 --> X && (Y & Op1) + if (Value *Res = foldBooleanAndOr(Y, Op1, I, /* IsAnd */ true, + /* IsLogical */ false)) + return replaceInstUsesWith(I, IsLogical ? Builder.CreateLogicalAnd(X, Res) + : Builder.CreateAnd(X, Res)); } if (Instruction *FoldedFCmps = reassociateFCmps(I, Builder)) @@ -3416,9 +3417,6 @@ Value *InstCombinerImpl::foldAndOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, return X; } - if (Value *X = foldEqOfParts(LHS, RHS, IsAnd)) - return X; - // (icmp ne A, 0) | (icmp ne B, 0) --> (icmp ne (A|B), 0) // (icmp eq A, 0) & (icmp eq B, 0) --> (icmp eq (A|B), 0) // TODO: Remove this and below when foldLogOpOfMaskedICmps can handle undefs. @@ -3541,6 +3539,9 @@ Value *InstCombinerImpl::foldBooleanAndOr(Value *LHS, Value *RHS, if (Value *Res = foldLogicOfFCmps(LHSCmp, RHSCmp, IsAnd, IsLogical)) return Res; + if (Value *Res = foldEqOfParts(LHS, RHS, IsAnd)) + return Res; + return nullptr; } @@ -3829,48 +3830,31 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { foldBooleanAndOr(Op0, Op1, I, /*IsAnd=*/false, /*IsLogical=*/false)) return replaceInstUsesWith(I, Res); - { - ICmpInst *LHS = dyn_cast(Op0); - ICmpInst *RHS = dyn_cast(Op1); - - // TODO: Base this on foldBooleanAndOr instead? - // TODO: Make this recursive; it's a little tricky because an arbitrary - // number of 'or' instructions might have to be created. - Value *X, *Y; - if (LHS && match(Op1, m_OneUse(m_LogicalOr(m_Value(X), m_Value(Y))))) { - bool IsLogical = isa(Op1); - // LHS | (X || Y) --> (LHS || X) || Y - if (auto *Cmp = dyn_cast(X)) - if (Value *Res = - foldAndOrOfICmps(LHS, Cmp, I, /* IsAnd */ false, IsLogical)) - return replaceInstUsesWith(I, IsLogical - ? Builder.CreateLogicalOr(Res, Y) - : Builder.CreateOr(Res, Y)); - // LHS | (X || Y) --> X || (LHS | Y) - if (auto *Cmp = dyn_cast(Y)) - if (Value *Res = foldAndOrOfICmps(LHS, Cmp, I, /* IsAnd */ false, - /* IsLogical */ false)) - return replaceInstUsesWith(I, IsLogical - ? Builder.CreateLogicalOr(X, Res) - : Builder.CreateOr(X, Res)); - } - if (RHS && match(Op0, m_OneUse(m_LogicalOr(m_Value(X), m_Value(Y))))) { - bool IsLogical = isa(Op0); - // (X || Y) | RHS --> (X || RHS) || Y - if (auto *Cmp = dyn_cast(X)) - if (Value *Res = - foldAndOrOfICmps(Cmp, RHS, I, /* IsAnd */ false, IsLogical)) - return replaceInstUsesWith(I, IsLogical - ? Builder.CreateLogicalOr(Res, Y) - : Builder.CreateOr(Res, Y)); - // (X || Y) | RHS --> X || (Y | RHS) - if (auto *Cmp = dyn_cast(Y)) - if (Value *Res = foldAndOrOfICmps(Cmp, RHS, I, /* IsAnd */ false, - /* IsLogical */ false)) - return replaceInstUsesWith(I, IsLogical - ? 
Builder.CreateLogicalOr(X, Res) - : Builder.CreateOr(X, Res)); - } + // TODO: Make this recursive; it's a little tricky because an arbitrary + // number of 'or' instructions might have to be created. + if (match(Op1, m_OneUse(m_LogicalOr(m_Value(X), m_Value(Y))))) { + bool IsLogical = isa(Op1); + // Op0 | (X || Y) --> (Op0 || X) || Y + if (Value *Res = foldBooleanAndOr(Op0, X, I, /* IsAnd */ false, IsLogical)) + return replaceInstUsesWith(I, IsLogical ? Builder.CreateLogicalOr(Res, Y) + : Builder.CreateOr(Res, Y)); + // Op0 | (X || Y) --> X || (Op0 | Y) + if (Value *Res = foldBooleanAndOr(Op0, Y, I, /* IsAnd */ false, + /* IsLogical */ false)) + return replaceInstUsesWith(I, IsLogical ? Builder.CreateLogicalOr(X, Res) + : Builder.CreateOr(X, Res)); + } + if (match(Op0, m_OneUse(m_LogicalOr(m_Value(X), m_Value(Y))))) { + bool IsLogical = isa(Op0); + // (X || Y) | Op1 --> (X || Op1) || Y + if (Value *Res = foldBooleanAndOr(X, Op1, I, /* IsAnd */ false, IsLogical)) + return replaceInstUsesWith(I, IsLogical ? Builder.CreateLogicalOr(Res, Y) + : Builder.CreateOr(Res, Y)); + // (X || Y) | Op1 --> X || (Y | Op1) + if (Value *Res = foldBooleanAndOr(Y, Op1, I, /* IsAnd */ false, + /* IsLogical */ false)) + return replaceInstUsesWith(I, IsLogical ? Builder.CreateLogicalOr(X, Res) + : Builder.CreateOr(X, Res)); } if (Instruction *FoldedFCmps = reassociateFCmps(I, Builder)) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 9588930d7658c..0508ed48fc19c 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -412,7 +412,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final bool IsAnd, bool IsLogical = false); Value *foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS, BinaryOperator &Xor); - Value *foldEqOfParts(ICmpInst *Cmp0, ICmpInst *Cmp1, bool IsAnd); + Value *foldEqOfParts(Value *Cmp0, Value *Cmp1, bool IsAnd); Value *foldAndOrOfICmpsUsingRanges(ICmpInst *ICmp1, ICmpInst *ICmp2, bool IsAnd); diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 1991ec82d1e1e..b664bde5d320a 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -1662,21 +1662,43 @@ static bool areIdenticalUpToCommutativity(const Instruction *I1, /// \endcode /// /// So we need to turn hoisted load/store into cload/cstore. +/// +/// \param BI The branch instruction. +/// \param SpeculatedConditionalLoadsStores The load/store instructions that +/// will be speculated. +/// \param Invert indicates if speculates FalseBB. Only used in triangle CFG. static void hoistConditionalLoadsStores( BranchInst *BI, SmallVectorImpl &SpeculatedConditionalLoadsStores, - bool Invert) { + std::optional Invert) { auto &Context = BI->getParent()->getContext(); auto *VCondTy = FixedVectorType::get(Type::getInt1Ty(Context), 1); auto *Cond = BI->getOperand(0); // Construct the condition if needed. BasicBlock *BB = BI->getParent(); - IRBuilder<> Builder(SpeculatedConditionalLoadsStores.back()); - Value *Mask = Builder.CreateBitCast( - Invert ? Builder.CreateXor(Cond, ConstantInt::getTrue(Context)) : Cond, - VCondTy); + IRBuilder<> Builder( + Invert.has_value() ? SpeculatedConditionalLoadsStores.back() : BI); + Value *Mask = nullptr; + Value *MaskFalse = nullptr; + Value *MaskTrue = nullptr; + if (Invert.has_value()) { + Mask = Builder.CreateBitCast( + *Invert ? 
Builder.CreateXor(Cond, ConstantInt::getTrue(Context)) : Cond, + VCondTy); + } else { + MaskFalse = Builder.CreateBitCast( + Builder.CreateXor(Cond, ConstantInt::getTrue(Context)), VCondTy); + MaskTrue = Builder.CreateBitCast(Cond, VCondTy); + } + auto PeekThroughBitcasts = [](Value *V) { + while (auto *BitCast = dyn_cast(V)) + V = BitCast->getOperand(0); + return V; + }; for (auto *I : SpeculatedConditionalLoadsStores) { - IRBuilder<> Builder(I); + IRBuilder<> Builder(Invert.has_value() ? I : BI); + if (!Invert.has_value()) + Mask = I->getParent() == BI->getSuccessor(0) ? MaskTrue : MaskFalse; // We currently assume conditional faulting load/store is supported for // scalar types only when creating new instructions. This can be easily // extended for vector types in the future. @@ -1688,12 +1710,14 @@ static void hoistConditionalLoadsStores( auto *Ty = I->getType(); PHINode *PN = nullptr; Value *PassThru = nullptr; - for (User *U : I->users()) - if ((PN = dyn_cast(U))) { - PassThru = Builder.CreateBitCast(PN->getIncomingValueForBlock(BB), - FixedVectorType::get(Ty, 1)); - break; - } + if (Invert.has_value()) + for (User *U : I->users()) + if ((PN = dyn_cast(U))) { + PassThru = Builder.CreateBitCast( + PeekThroughBitcasts(PN->getIncomingValueForBlock(BB)), + FixedVectorType::get(Ty, 1)); + break; + } MaskedLoadStore = Builder.CreateMaskedLoad( FixedVectorType::get(Ty, 1), Op0, LI->getAlign(), Mask, PassThru); Value *NewLoadStore = Builder.CreateBitCast(MaskedLoadStore, Ty); @@ -1702,8 +1726,8 @@ static void hoistConditionalLoadsStores( I->replaceAllUsesWith(NewLoadStore); } else { // Handle Store. - auto *StoredVal = - Builder.CreateBitCast(Op0, FixedVectorType::get(Op0->getType(), 1)); + auto *StoredVal = Builder.CreateBitCast( + PeekThroughBitcasts(Op0), FixedVectorType::get(Op0->getType(), 1)); MaskedLoadStore = Builder.CreateMaskedStore( StoredVal, I->getOperand(1), cast(I)->getAlign(), Mask); } @@ -3155,7 +3179,8 @@ static bool validateAndCostRequiredSelects(BasicBlock *BB, BasicBlock *ThenBB, return HaveRewritablePHIs; } -static bool isProfitableToSpeculate(const BranchInst *BI, bool Invert, +static bool isProfitableToSpeculate(const BranchInst *BI, + std::optional Invert, const TargetTransformInfo &TTI) { // If the branch is non-unpredictable, and is predicted to *not* branch to // the `then` block, then avoid speculating it. @@ -3166,7 +3191,10 @@ static bool isProfitableToSpeculate(const BranchInst *BI, bool Invert, if (!extractBranchWeights(*BI, TWeight, FWeight) || (TWeight + FWeight) == 0) return true; - uint64_t EndWeight = Invert ? TWeight : FWeight; + if (!Invert.has_value()) + return false; + + uint64_t EndWeight = *Invert ? 
@@ -3155,7 +3179,8 @@ static bool validateAndCostRequiredSelects(BasicBlock *BB, BasicBlock *ThenBB,
   return HaveRewritablePHIs;
 }

-static bool isProfitableToSpeculate(const BranchInst *BI, bool Invert,
+static bool isProfitableToSpeculate(const BranchInst *BI,
+                                    std::optional<bool> Invert,
                                     const TargetTransformInfo &TTI) {
   // If the branch is non-unpredictable, and is predicted to *not* branch to
   // the `then` block, then avoid speculating it.
@@ -3166,7 +3191,10 @@ static bool isProfitableToSpeculate(const BranchInst *BI, bool Invert,
   if (!extractBranchWeights(*BI, TWeight, FWeight) || (TWeight + FWeight) == 0)
     return true;

-  uint64_t EndWeight = Invert ? TWeight : FWeight;
+  if (!Invert.has_value())
+    return false;
+
+  uint64_t EndWeight = *Invert ? TWeight : FWeight;
   BranchProbability BIEndProb =
       BranchProbability::getBranchProbability(EndWeight, TWeight + FWeight);
   BranchProbability Likely = TTI.getPredictableBranchThreshold();
@@ -8034,6 +8062,35 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
     if (HoistCommon &&
         hoistCommonCodeFromSuccessors(BI, !Options.HoistCommonInsts))
       return requestResimplify();
+
+    if (BI && HoistLoadsStoresWithCondFaulting &&
+        Options.HoistLoadsStoresWithCondFaulting &&
+        isProfitableToSpeculate(BI, std::nullopt, TTI)) {
+      SmallVector<Instruction *> SpeculatedConditionalLoadsStores;
+      auto CanSpeculateConditionalLoadsStores = [&]() {
+        for (auto *Succ : successors(BB)) {
+          for (Instruction &I : *Succ) {
+            if (I.isTerminator()) {
+              if (I.getNumSuccessors() > 1)
+                return false;
+              continue;
+            } else if (!isSafeCheapLoadStore(&I, TTI) ||
+                       SpeculatedConditionalLoadsStores.size() ==
+                           HoistLoadsStoresWithCondFaultingThreshold) {
+              return false;
+            }
+            SpeculatedConditionalLoadsStores.push_back(&I);
+          }
+        }
+        return !SpeculatedConditionalLoadsStores.empty();
+      };
+
+      if (CanSpeculateConditionalLoadsStores()) {
+        hoistConditionalLoadsStores(BI, SpeculatedConditionalLoadsStores,
+                                    std::nullopt);
+        return requestResimplify();
+      }
+    }
   } else {
     // If Successor #1 has multiple preds, we may be able to conditionally
     // execute Successor #0 if it branches to Successor #1.
diff --git a/llvm/test/Analysis/CostModel/RISCV/abs.ll b/llvm/test/Analysis/CostModel/RISCV/abs.ll
index 8f0dd7b0aefe9..7252716af8605 100644
--- a/llvm/test/Analysis/CostModel/RISCV/abs.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/abs.ll
@@ -44,37 +44,37 @@ declare <vscale x 64 x i8> @llvm.abs.nxv64i8(<vscale x 64 x i8>, i1)

 define i32 @abs(i32 %arg) {
 ; CHECK-LABEL: 'abs'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %2 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %3 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %4 = call <vscale x 2 x i64> @llvm.abs.nxv2i64(<vscale x 2 x i64> undef, i1 false)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %5 = call <vscale x 4 x i64> @llvm.abs.nxv4i64(<vscale x 4 x i64> undef, i1 false)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %6 = call <vscale x 8 x i64> @llvm.abs.nxv8i64(<vscale x 8 x i64> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %2 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %3 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %4 = call <vscale x 2 x i64> @llvm.abs.nxv2i64(<vscale x 2 x i64> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %5 = call <vscale x 4 x i64> @llvm.abs.nxv4i64(<vscale x 4 x i64> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %6 = call <vscale x 8 x i64> @llvm.abs.nxv8i64(<vscale x 8 x i64> undef, i1 false)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %7 = call <2 x i32> @llvm.abs.v2i32(<2 x i32> undef, i1 false)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %8 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> undef, i1 false)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %9 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> undef, i1 false)
-; CHECK-NEXT:  Cost Model: Found
an estimated cost of 2 for instruction: %10 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %10 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %11 = call @llvm.abs.nxv2i32( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %12 = call @llvm.abs.nxv4i32( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %13 = call @llvm.abs.nxv8i32( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %14 = call @llvm.abs.nxv16i32( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %12 = call @llvm.abs.nxv4i32( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %13 = call @llvm.abs.nxv8i32( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %14 = call @llvm.abs.nxv16i32( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %15 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %16 = call <4 x i16> @llvm.abs.v4i16(<4 x i16> undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %18 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %19 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %20 = call @llvm.abs.nxv2i16( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %21 = call @llvm.abs.nxv4i16( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %22 = call @llvm.abs.nxv8i16( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %23 = call @llvm.abs.nxv16i16( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %24 = call @llvm.abs.nxv32i16( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %22 = call @llvm.abs.nxv8i16( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %23 = call @llvm.abs.nxv16i16( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %24 = call @llvm.abs.nxv32i16( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %25 = call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %26 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %27 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> undef, i1 
false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %28 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %27 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %28 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %29 = call @llvm.abs.nxv8i8( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %30 = call @llvm.abs.nxv16i8( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %31 = call @llvm.abs.nxv32i8( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %32 = call @llvm.abs.nxv64i8( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %30 = call @llvm.abs.nxv16i8( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %31 = call @llvm.abs.nxv32i8( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %32 = call @llvm.abs.nxv64i8( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false) diff --git a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll index 800ea223850d3..c7cd845a0a03f 100644 --- a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll +++ b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -passes="print" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v | FileCheck %s -; RUN: opt < %s -passes="print" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v --type-based-intrinsic-cost=true | FileCheck %s --check-prefixes=TYPEBASED +; RUN: opt < %s -passes="print" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin | FileCheck %s +; RUN: opt < %s -passes="print" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin --type-based-intrinsic-cost=true | FileCheck %s --check-prefixes=TYPEBASED define void @unsupported_fp_ops( %vec, i32 %extraarg) { ; CHECK-LABEL: 'unsupported_fp_ops' @@ -1147,28 +1147,28 @@ define void @abs() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <2 x i64> @llvm.vp.abs.v2i64(<2 x i64> undef, i1 false, <2 x i1> undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %11 = call <4 x i64> @llvm.vp.abs.v4i64(<4 x i64> undef, i1 false, <4 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %12 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %13 = call <8 x i64> @llvm.vp.abs.v8i64(<8 x i64> undef, i1 false, <8 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %14 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false) -; CHECK-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %15 = call <16 x i64> @llvm.vp.abs.v16i64(<16 x i64> undef, i1 false, <16 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %16 = call <16 x i64> @llvm.abs.v16i64(<16 x i64> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %11 = call <4 x i64> @llvm.vp.abs.v4i64(<4 x i64> undef, i1 false, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %12 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %13 = call <8 x i64> @llvm.vp.abs.v8i64(<8 x i64> undef, i1 false, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %14 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %15 = call <16 x i64> @llvm.vp.abs.v16i64(<16 x i64> undef, i1 false, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %16 = call <16 x i64> @llvm.abs.v16i64(<16 x i64> undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call @llvm.vp.abs.nxv2i8( undef, i1 false, undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call @llvm.abs.nxv2i8( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = call @llvm.vp.abs.nxv4i8( undef, i1 false, undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %20 = call @llvm.abs.nxv4i8( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %21 = call @llvm.vp.abs.nxv8i8( undef, i1 false, undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %22 = call @llvm.abs.nxv8i8( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %23 = call @llvm.vp.abs.nxv16i8( undef, i1 false, undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %24 = call @llvm.abs.nxv16i8( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %25 = call @llvm.vp.abs.nxv2i64( undef, i1 false, undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %26 = call @llvm.abs.nxv2i64( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %27 = call @llvm.vp.abs.nxv4i64( undef, i1 false, undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %28 = call @llvm.abs.nxv4i64( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %29 = call @llvm.vp.abs.nxv8i64( undef, i1 false, undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %30 = call @llvm.abs.nxv8i64( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %31 = call @llvm.vp.abs.nxv16i64( undef, i1 false, undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %32 = call @llvm.abs.nxv16i64( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %23 = call @llvm.vp.abs.nxv16i8( undef, i1 false, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %24 = call @llvm.abs.nxv16i8( undef, i1 false) +; CHECK-NEXT: Cost 
Model: Found an estimated cost of 4 for instruction: %25 = call @llvm.vp.abs.nxv2i64( undef, i1 false, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %26 = call @llvm.abs.nxv2i64( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %27 = call @llvm.vp.abs.nxv4i64( undef, i1 false, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %28 = call @llvm.abs.nxv4i64( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %29 = call @llvm.vp.abs.nxv8i64( undef, i1 false, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %30 = call @llvm.abs.nxv8i64( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %31 = call @llvm.vp.abs.nxv16i64( undef, i1 false, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %32 = call @llvm.abs.nxv16i64( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; TYPEBASED-LABEL: 'abs' @@ -1182,28 +1182,28 @@ define void @abs() { ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 false) ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <2 x i64> @llvm.vp.abs.v2i64(<2 x i64> undef, i1 false, <2 x i1> undef, i32 undef) ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %11 = call <4 x i64> @llvm.vp.abs.v4i64(<4 x i64> undef, i1 false, <4 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %12 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %13 = call <8 x i64> @llvm.vp.abs.v8i64(<8 x i64> undef, i1 false, <8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %14 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %15 = call <16 x i64> @llvm.vp.abs.v16i64(<16 x i64> undef, i1 false, <16 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %16 = call <16 x i64> @llvm.abs.v16i64(<16 x i64> undef, i1 false) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %11 = call <4 x i64> @llvm.vp.abs.v4i64(<4 x i64> undef, i1 false, <4 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %12 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %13 = call <8 x i64> @llvm.vp.abs.v8i64(<8 x i64> undef, i1 false, <8 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %14 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %15 = call <16 x i64> @llvm.vp.abs.v16i64(<16 x i64> undef, i1 false, <16 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %16 = call <16 x i64> @llvm.abs.v16i64(<16 x i64> undef, i1 false) ; TYPEBASED-NEXT: Cost Model: Found 
an estimated cost of 2 for instruction: %17 = call @llvm.vp.abs.nxv2i8( undef, i1 false, undef, i32 undef) ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call @llvm.abs.nxv2i8( undef, i1 false) ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = call @llvm.vp.abs.nxv4i8( undef, i1 false, undef, i32 undef) ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %20 = call @llvm.abs.nxv4i8( undef, i1 false) ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %21 = call @llvm.vp.abs.nxv8i8( undef, i1 false, undef, i32 undef) ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %22 = call @llvm.abs.nxv8i8( undef, i1 false) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %23 = call @llvm.vp.abs.nxv16i8( undef, i1 false, undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %24 = call @llvm.abs.nxv16i8( undef, i1 false) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %25 = call @llvm.vp.abs.nxv2i64( undef, i1 false, undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %26 = call @llvm.abs.nxv2i64( undef, i1 false) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %27 = call @llvm.vp.abs.nxv4i64( undef, i1 false, undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %28 = call @llvm.abs.nxv4i64( undef, i1 false) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %29 = call @llvm.vp.abs.nxv8i64( undef, i1 false, undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %30 = call @llvm.abs.nxv8i64( undef, i1 false) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %31 = call @llvm.vp.abs.nxv16i64( undef, i1 false, undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %32 = call @llvm.abs.nxv16i64( undef, i1 false) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %23 = call @llvm.vp.abs.nxv16i8( undef, i1 false, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %24 = call @llvm.abs.nxv16i8( undef, i1 false) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %25 = call @llvm.vp.abs.nxv2i64( undef, i1 false, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %26 = call @llvm.abs.nxv2i64( undef, i1 false) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %27 = call @llvm.vp.abs.nxv4i64( undef, i1 false, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %28 = call @llvm.abs.nxv4i64( undef, i1 false) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %29 = call @llvm.vp.abs.nxv8i64( undef, i1 false, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %30 = call @llvm.abs.nxv8i64( undef, i1 false) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %31 = call @llvm.vp.abs.nxv16i64( undef, i1 false, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %32 = call @llvm.abs.nxv16i64( undef, i1 false) ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call <2 x 
i8> @llvm.vp.abs.v2i8(<2 x i8> undef, i1 0, <2 x i1> undef, i32 undef) @@ -2125,6 +2125,232 @@ define void @vp_fdiv(){ ret void } +define void @splat() { +; CHECK-LABEL: 'splat' +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %1 = call <2 x i1> @llvm.experimental.vp.splat.v2i1(i1 undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %2 = call <4 x i1> @llvm.experimental.vp.splat.v4i1(i1 undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %3 = call <8 x i1> @llvm.experimental.vp.splat.v8i1(i1 undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %4 = call <16 x i1> @llvm.experimental.vp.splat.v16i1(i1 undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = call <2 x i8> @llvm.experimental.vp.splat.v2i8(i8 undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <4 x i8> @llvm.experimental.vp.splat.v4i8(i8 undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <8 x i8> @llvm.experimental.vp.splat.v8i8(i8 undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <16 x i8> @llvm.experimental.vp.splat.v16i8(i8 undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <2 x i16> @llvm.experimental.vp.splat.v2i16(i16 undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call <4 x i16> @llvm.experimental.vp.splat.v4i16(i16 undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = call <8 x i16> @llvm.experimental.vp.splat.v8i16(i16 undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %12 = call <16 x i16> @llvm.experimental.vp.splat.v16i16(i16 undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = call <2 x i32> @llvm.experimental.vp.splat.v2i32(i32 undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = call <4 x i32> @llvm.experimental.vp.splat.v4i32(i32 undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %15 = call <8 x i32> @llvm.experimental.vp.splat.v8i32(i32 undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %16 = call <16 x i32> @llvm.experimental.vp.splat.v16i32(i32 undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = call <2 x i64> @llvm.experimental.vp.splat.v2i64(i64 undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call <4 x i64> @llvm.experimental.vp.splat.v4i64(i64 undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %19 = call <8 x i64> @llvm.experimental.vp.splat.v8i64(i64 undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %20 = call <16 x i64> @llvm.experimental.vp.splat.v16i64(i64 undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = call <2 x bfloat> @llvm.experimental.vp.splat.v2bf16(bfloat 
undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = call <4 x bfloat> @llvm.experimental.vp.splat.v4bf16(bfloat undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = call <8 x bfloat> @llvm.experimental.vp.splat.v8bf16(bfloat undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %24 = call <16 x bfloat> @llvm.experimental.vp.splat.v16bf16(bfloat undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %25 = call <2 x half> @llvm.experimental.vp.splat.v2f16(half undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %26 = call <4 x half> @llvm.experimental.vp.splat.v4f16(half undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = call <8 x half> @llvm.experimental.vp.splat.v8f16(half undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %28 = call <16 x half> @llvm.experimental.vp.splat.v16f16(half undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %29 = call <2 x float> @llvm.experimental.vp.splat.v2f32(float undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %30 = call <4 x float> @llvm.experimental.vp.splat.v4f32(float undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %31 = call <8 x float> @llvm.experimental.vp.splat.v8f32(float undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %32 = call <16 x float> @llvm.experimental.vp.splat.v16f32(float undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %33 = call <2 x double> @llvm.experimental.vp.splat.v2f64(double undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %34 = call <4 x double> @llvm.experimental.vp.splat.v4f64(double undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %35 = call <8 x double> @llvm.experimental.vp.splat.v8f64(double undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %36 = call <16 x double> @llvm.experimental.vp.splat.v16f64(double undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %37 = call @llvm.experimental.vp.splat.nxv2i1(i1 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %38 = call @llvm.experimental.vp.splat.nxv4i1(i1 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %39 = call @llvm.experimental.vp.splat.nxv8i1(i1 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %40 = call @llvm.experimental.vp.splat.nxv16i1(i1 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %41 = call @llvm.experimental.vp.splat.nxv2i8(i8 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %42 = call @llvm.experimental.vp.splat.nxv4i8(i8 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %43 = call @llvm.experimental.vp.splat.nxv8i8(i8 undef, 
undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %44 = call @llvm.experimental.vp.splat.nxv16i8(i8 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %45 = call @llvm.experimental.vp.splat.nxv2i16(i16 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %46 = call @llvm.experimental.vp.splat.nxv4i16(i16 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %47 = call @llvm.experimental.vp.splat.nxv8i16(i16 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %48 = call @llvm.experimental.vp.splat.nxv16i16(i16 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %49 = call @llvm.experimental.vp.splat.nxv2i32(i32 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %50 = call @llvm.experimental.vp.splat.nxv4i32(i32 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %51 = call @llvm.experimental.vp.splat.nxv8i32(i32 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %52 = call @llvm.experimental.vp.splat.nxv16i32(i32 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %53 = call @llvm.experimental.vp.splat.nxv2i64(i64 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %54 = call @llvm.experimental.vp.splat.nxv4i64(i64 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %55 = call @llvm.experimental.vp.splat.nxv8i64(i64 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %56 = call @llvm.experimental.vp.splat.nxv16i64(i64 undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %57 = call @llvm.experimental.vp.splat.nxv2bf16(bfloat undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %58 = call @llvm.experimental.vp.splat.nxv4bf16(bfloat undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %59 = call @llvm.experimental.vp.splat.nxv8bf16(bfloat undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %60 = call @llvm.experimental.vp.splat.nxv16bf16(bfloat undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %61 = call @llvm.experimental.vp.splat.nxv2f16(half undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %62 = call @llvm.experimental.vp.splat.nxv4f16(half undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %63 = call @llvm.experimental.vp.splat.nxv8f16(half undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %64 = call @llvm.experimental.vp.splat.nxv16f16(half undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %65 = call @llvm.experimental.vp.splat.nxv2f32(float undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %66 = call @llvm.experimental.vp.splat.nxv4f32(float undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %67 = 
call @llvm.experimental.vp.splat.nxv8f32(float undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %68 = call @llvm.experimental.vp.splat.nxv16f32(float undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %69 = call @llvm.experimental.vp.splat.nxv2f64(double undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %70 = call @llvm.experimental.vp.splat.nxv4f64(double undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %71 = call @llvm.experimental.vp.splat.nxv8f64(double undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %72 = call @llvm.experimental.vp.splat.nxv16f64(double undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; TYPEBASED-LABEL: 'splat' +; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %1 = call <2 x i1> @llvm.experimental.vp.splat.v2i1(i1 undef, <2 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %2 = call <4 x i1> @llvm.experimental.vp.splat.v4i1(i1 undef, <4 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %3 = call <8 x i1> @llvm.experimental.vp.splat.v8i1(i1 undef, <8 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %4 = call <16 x i1> @llvm.experimental.vp.splat.v16i1(i1 undef, <16 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = call <2 x i8> @llvm.experimental.vp.splat.v2i8(i8 undef, <2 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <4 x i8> @llvm.experimental.vp.splat.v4i8(i8 undef, <4 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <8 x i8> @llvm.experimental.vp.splat.v8i8(i8 undef, <8 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <16 x i8> @llvm.experimental.vp.splat.v16i8(i8 undef, <16 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <2 x i16> @llvm.experimental.vp.splat.v2i16(i16 undef, <2 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call <4 x i16> @llvm.experimental.vp.splat.v4i16(i16 undef, <4 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = call <8 x i16> @llvm.experimental.vp.splat.v8i16(i16 undef, <8 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %12 = call <16 x i16> @llvm.experimental.vp.splat.v16i16(i16 undef, <16 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = call <2 x i32> @llvm.experimental.vp.splat.v2i32(i32 undef, <2 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = call <4 x i32> @llvm.experimental.vp.splat.v4i32(i32 undef, <4 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %15 = call <8 x i32> @llvm.experimental.vp.splat.v8i32(i32 undef, <8 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %16 = call <16 x i32> @llvm.experimental.vp.splat.v16i32(i32 undef, 
<16 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = call <2 x i64> @llvm.experimental.vp.splat.v2i64(i64 undef, <2 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call <4 x i64> @llvm.experimental.vp.splat.v4i64(i64 undef, <4 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %19 = call <8 x i64> @llvm.experimental.vp.splat.v8i64(i64 undef, <8 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %20 = call <16 x i64> @llvm.experimental.vp.splat.v16i64(i64 undef, <16 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = call <2 x bfloat> @llvm.experimental.vp.splat.v2bf16(bfloat undef, <2 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = call <4 x bfloat> @llvm.experimental.vp.splat.v4bf16(bfloat undef, <4 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = call <8 x bfloat> @llvm.experimental.vp.splat.v8bf16(bfloat undef, <8 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %24 = call <16 x bfloat> @llvm.experimental.vp.splat.v16bf16(bfloat undef, <16 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %25 = call <2 x half> @llvm.experimental.vp.splat.v2f16(half undef, <2 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %26 = call <4 x half> @llvm.experimental.vp.splat.v4f16(half undef, <4 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = call <8 x half> @llvm.experimental.vp.splat.v8f16(half undef, <8 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %28 = call <16 x half> @llvm.experimental.vp.splat.v16f16(half undef, <16 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %29 = call <2 x float> @llvm.experimental.vp.splat.v2f32(float undef, <2 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %30 = call <4 x float> @llvm.experimental.vp.splat.v4f32(float undef, <4 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %31 = call <8 x float> @llvm.experimental.vp.splat.v8f32(float undef, <8 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %32 = call <16 x float> @llvm.experimental.vp.splat.v16f32(float undef, <16 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %33 = call <2 x double> @llvm.experimental.vp.splat.v2f64(double undef, <2 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %34 = call <4 x double> @llvm.experimental.vp.splat.v4f64(double undef, <4 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %35 = call <8 x double> @llvm.experimental.vp.splat.v8f64(double undef, <8 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %36 = call <16 x double> @llvm.experimental.vp.splat.v16f64(double undef, <16 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: 
Invalid cost for instruction: %37 = call @llvm.experimental.vp.splat.nxv2i1(i1 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %38 = call @llvm.experimental.vp.splat.nxv4i1(i1 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %39 = call @llvm.experimental.vp.splat.nxv8i1(i1 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %40 = call @llvm.experimental.vp.splat.nxv16i1(i1 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %41 = call @llvm.experimental.vp.splat.nxv2i8(i8 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %42 = call @llvm.experimental.vp.splat.nxv4i8(i8 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %43 = call @llvm.experimental.vp.splat.nxv8i8(i8 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %44 = call @llvm.experimental.vp.splat.nxv16i8(i8 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %45 = call @llvm.experimental.vp.splat.nxv2i16(i16 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %46 = call @llvm.experimental.vp.splat.nxv4i16(i16 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %47 = call @llvm.experimental.vp.splat.nxv8i16(i16 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %48 = call @llvm.experimental.vp.splat.nxv16i16(i16 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %49 = call @llvm.experimental.vp.splat.nxv2i32(i32 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %50 = call @llvm.experimental.vp.splat.nxv4i32(i32 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %51 = call @llvm.experimental.vp.splat.nxv8i32(i32 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %52 = call @llvm.experimental.vp.splat.nxv16i32(i32 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %53 = call @llvm.experimental.vp.splat.nxv2i64(i64 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %54 = call @llvm.experimental.vp.splat.nxv4i64(i64 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %55 = call @llvm.experimental.vp.splat.nxv8i64(i64 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %56 = call @llvm.experimental.vp.splat.nxv16i64(i64 undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %57 = call @llvm.experimental.vp.splat.nxv2bf16(bfloat undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %58 = call @llvm.experimental.vp.splat.nxv4bf16(bfloat undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %59 = call @llvm.experimental.vp.splat.nxv8bf16(bfloat undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %60 = call 
@llvm.experimental.vp.splat.nxv16bf16(bfloat undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %61 = call @llvm.experimental.vp.splat.nxv2f16(half undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %62 = call @llvm.experimental.vp.splat.nxv4f16(half undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %63 = call @llvm.experimental.vp.splat.nxv8f16(half undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %64 = call @llvm.experimental.vp.splat.nxv16f16(half undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %65 = call @llvm.experimental.vp.splat.nxv2f32(float undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %66 = call @llvm.experimental.vp.splat.nxv4f32(float undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %67 = call @llvm.experimental.vp.splat.nxv8f32(float undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %68 = call @llvm.experimental.vp.splat.nxv16f32(float undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %69 = call @llvm.experimental.vp.splat.nxv2f64(double undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %70 = call @llvm.experimental.vp.splat.nxv4f64(double undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %71 = call @llvm.experimental.vp.splat.nxv8f64(double undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %72 = call @llvm.experimental.vp.splat.nxv16f64(double undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + call <2 x i1> @llvm.experimental.vp.splat.v2i1(i1 undef, <2 x i1> undef, i32 undef) + call <4 x i1> @llvm.experimental.vp.splat.v4i1(i1 undef, <4 x i1> undef, i32 undef) + call <8 x i1> @llvm.experimental.vp.splat.v8i1(i1 undef, <8 x i1> undef, i32 undef) + call <16 x i1> @llvm.experimental.vp.splat.v16i1(i1 undef, <16 x i1> undef, i32 undef) + call <2 x i8> @llvm.experimental.vp.splat.v2i8(i8 undef, <2 x i1> undef, i32 undef) + call <4 x i8> @llvm.experimental.vp.splat.v4i8(i8 undef, <4 x i1> undef, i32 undef) + call <8 x i8> @llvm.experimental.vp.splat.v8i8(i8 undef, <8 x i1> undef, i32 undef) + call <16 x i8> @llvm.experimental.vp.splat.v16i8(i8 undef, <16 x i1> undef, i32 undef) + call <2 x i16> @llvm.experimental.vp.splat.v2i16(i16 undef, <2 x i1> undef, i32 undef) + call <4 x i16> @llvm.experimental.vp.splat.v4i16(i16 undef, <4 x i1> undef, i32 undef) + call <8 x i16> @llvm.experimental.vp.splat.v8i16(i16 undef, <8 x i1> undef, i32 undef) + call <16 x i16> @llvm.experimental.vp.splat.v16i16(i16 undef, <16 x i1> undef, i32 undef) + call <2 x i32> @llvm.experimental.vp.splat.v2i32(i32 undef, <2 x i1> undef, i32 undef) + call <4 x i32> @llvm.experimental.vp.splat.v4i32(i32 undef, <4 x i1> undef, i32 undef) + call <8 x i32> @llvm.experimental.vp.splat.v8i32(i32 undef, <8 x i1> undef, i32 undef) + call <16 x i32> @llvm.experimental.vp.splat.v16i32(i32 undef, <16 x i1> undef, i32 undef) + call <2 x i64> @llvm.experimental.vp.splat.v2i64(i64 undef, <2 x i1> undef, i32 undef) + call <4 x i64> 
@llvm.experimental.vp.splat.v4i64(i64 undef, <4 x i1> undef, i32 undef) + call <8 x i64> @llvm.experimental.vp.splat.v8i64(i64 undef, <8 x i1> undef, i32 undef) + call <16 x i64> @llvm.experimental.vp.splat.v16i64(i64 undef, <16 x i1> undef, i32 undef) + call <2 x bfloat> @llvm.experimental.vp.splat.v2bf16(bfloat undef, <2 x i1> undef, i32 undef) + call <4 x bfloat> @llvm.experimental.vp.splat.v4bf16(bfloat undef, <4 x i1> undef, i32 undef) + call <8 x bfloat> @llvm.experimental.vp.splat.v8bf16(bfloat undef, <8 x i1> undef, i32 undef) + call <16 x bfloat> @llvm.experimental.vp.splat.v16bf16(bfloat undef, <16 x i1> undef, i32 undef) + call <2 x half> @llvm.experimental.vp.splat.v2f16(half undef, <2 x i1> undef, i32 undef) + call <4 x half> @llvm.experimental.vp.splat.v4f16(half undef, <4 x i1> undef, i32 undef) + call <8 x half> @llvm.experimental.vp.splat.v8f16(half undef, <8 x i1> undef, i32 undef) + call <16 x half> @llvm.experimental.vp.splat.v16f16(half undef, <16 x i1> undef, i32 undef) + call <2 x float> @llvm.experimental.vp.splat.v2f32(float undef, <2 x i1> undef, i32 undef) + call <4 x float> @llvm.experimental.vp.splat.v4f32(float undef, <4 x i1> undef, i32 undef) + call <8 x float> @llvm.experimental.vp.splat.v8f32(float undef, <8 x i1> undef, i32 undef) + call <16 x float> @llvm.experimental.vp.splat.v16f32(float undef, <16 x i1> undef, i32 undef) + call <2 x double> @llvm.experimental.vp.splat.v2f64(double undef, <2 x i1> undef, i32 undef) + call <4 x double> @llvm.experimental.vp.splat.v4f64(double undef, <4 x i1> undef, i32 undef) + call <8 x double> @llvm.experimental.vp.splat.v8f64(double undef, <8 x i1> undef, i32 undef) + call <16 x double> @llvm.experimental.vp.splat.v16f64(double undef, <16 x i1> undef, i32 undef) + call @llvm.experimental.vp.splat.nxv2i1(i1 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv4i1(i1 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv8i1(i1 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv16i1(i1 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv2i8(i8 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv4i8(i8 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv8i8(i8 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv16i8(i8 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv2i16(i16 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv4i16(i16 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv8i16(i16 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv16i16(i16 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv2i32(i32 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv4i32(i32 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv8i32(i32 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv16i32(i32 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv2i64(i64 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv4i64(i64 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv8i64(i64 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv16i64(i64 undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv2bf16(bfloat undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv4bf16(bfloat undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv8bf16(bfloat undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv16bf16(bfloat undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv2f16(half undef, 
undef, i32 undef) + call @llvm.experimental.vp.splat.nxv4f16(half undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv8f16(half undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv16f16(half undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv2f32(float undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv4f32(float undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv8f32(float undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv16f32(float undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv2f64(double undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv4f64(double undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv8f64(double undef, undef, i32 undef) + call @llvm.experimental.vp.splat.nxv16f64(double undef, undef, i32 undef) + ret void +} + declare <2 x i8> @llvm.vp.add.v2i8(<2 x i8>, <2 x i8>, <2 x i1>, i32) declare <4 x i8> @llvm.vp.add.v4i8(<4 x i8>, <4 x i8>, <4 x i1>, i32) declare <8 x i8> @llvm.vp.add.v8i8(<8 x i8>, <8 x i8>, <8 x i1>, i32) diff --git a/llvm/test/CodeGen/AArch64/arm64-vshift.ll b/llvm/test/CodeGen/AArch64/arm64-vshift.ll index 7af7c235f9ac1..2f543cc324bc2 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vshift.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vshift.ll @@ -1,12 +1,114 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -enable-misched=false | FileCheck %s +; RUN: llc < %s -mtriple=arm64-eabi -global-isel=0 | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -mtriple=arm64-eabi -global-isel=1 -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for sqshl1d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshl1d_constant +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshl_scalar +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshl_scalar_constant +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqshl1d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqshl1d_constant +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqshl_scalar +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqshl_scalar_constant +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for srshl1d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for srshl1d_constant +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for srshl_scalar +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for srshl_scalar_constant +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for urshl1d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for urshl1d_constant +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for urshl_scalar +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for urshl_scalar_constant +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshl1d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshl1d_constant +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshl_scalar +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshl_scalar_constant +; CHECK-GI-NEXT: warning: Instruction selection used fallback path 
for uqrshl1d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqrshl1d_constant +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqrshl_scalar +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqrshl_scalar_constant +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for urshr1d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for urshr_scalar +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for srshr1d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for srshr_scalar +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshlu8b +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshlu4h +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshlu2s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshlu16b +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshlu8h +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshlu4s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshlu2d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshlu1d_constant +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshlu_i64_constant +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshlu_i32_constant +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshrn1s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshrn8b +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshrn4h +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshrn2s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshrn16b +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshrn8h +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshrn4s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshrun1s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshrun8b +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshrun4h +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshrun2s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshrun16b +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshrun8h +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshrun4s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshrn1s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshrn8b +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshrn4h +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshrn2s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshrn16b +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshrn8h +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshrn4s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshrun1s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshrun8b +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshrun4h +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshrun2s +; CHECK-GI-NEXT: warning: Instruction selection 
used fallback path for sqrshrun16b
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshrun8h
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqrshrun4s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqrshrn1s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqrshrn8b
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqrshrn4h
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqrshrn2s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqrshrn16b
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqrshrn8h
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqrshrn4s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqshrn1s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqshrn8b
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqshrn4h
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqshrn2s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqshrn16b
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqshrn8h
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uqshrn4s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for neon_ushl_vscalar_constant_shift
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for neon_ushl_scalar_constant_shift
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for neon_sshll_vscalar_constant_shift
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for neon_sshll_scalar_constant_shift
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for neon_sshll_scalar_constant_shift_m1
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for ursra1d
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for ursra_scalar
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for srsra1d
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for srsra_scalar
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sli8b
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sli4h
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sli2s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sli1d
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sli16b
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sli8h
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sli4s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sli2d
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshlu_zero_shift_amount
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for lshr_trunc_v2i64_v2i8
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for ashr_trunc_v2i64_v2i8

 define <8 x i8> @sqshl8b(ptr %A, ptr %B) nounwind {
 ; CHECK-LABEL: sqshl8b:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: sqshl.8b v0, v0, v1
+; CHECK-NEXT: sqshl v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp2 = load <8 x i8>, ptr %B
@@ -19,7 +121,7 @@ define <4 x i16> @sqshl4h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: sqshl.4h v0, v0, v1
+; CHECK-NEXT: sqshl v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp2 = load <4 x i16>, ptr %B
@@ -32,7 +134,7 @@ define <2 x i32> @sqshl2s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: sqshl.2s v0, v0, v1
+; CHECK-NEXT: sqshl v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp2 = load <2 x i32>, ptr %B
@@ -97,7 +199,7 @@ define <8 x i8> @uqshl8b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: uqshl.8b v0, v0, v1
+; CHECK-NEXT: uqshl v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp2 = load <8 x i8>, ptr %B
@@ -110,7 +212,7 @@ define <4 x i16> @uqshl4h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: uqshl.4h v0, v0, v1
+; CHECK-NEXT: uqshl v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp2 = load <4 x i16>, ptr %B
@@ -123,7 +225,7 @@ define <2 x i32> @uqshl2s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: uqshl.2s v0, v0, v1
+; CHECK-NEXT: uqshl v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp2 = load <2 x i32>, ptr %B
@@ -136,7 +238,7 @@ define <16 x i8> @sqshl16b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqshl.16b v0, v0, v1
+; CHECK-NEXT: sqshl v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp2 = load <16 x i8>, ptr %B
@@ -149,7 +251,7 @@ define <8 x i16> @sqshl8h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqshl.8h v0, v0, v1
+; CHECK-NEXT: sqshl v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp2 = load <8 x i16>, ptr %B
@@ -162,7 +264,7 @@ define <4 x i32> @sqshl4s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqshl.4s v0, v0, v1
+; CHECK-NEXT: sqshl v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp2 = load <4 x i32>, ptr %B
@@ -175,7 +277,7 @@ define <2 x i64> @sqshl2d(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqshl.2d v0, v0, v1
+; CHECK-NEXT: sqshl v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp2 = load <2 x i64>, ptr %B
@@ -188,7 +290,7 @@ define <16 x i8> @uqshl16b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: uqshl.16b v0, v0, v1
+; CHECK-NEXT: uqshl v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp2 = load <16 x i8>, ptr %B
@@ -201,7 +303,7 @@ define <8 x i16> @uqshl8h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: uqshl.8h v0, v0, v1
+; CHECK-NEXT: uqshl v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp2 = load <8 x i16>, ptr %B
@@ -214,7 +316,7 @@ define <4 x i32> @uqshl4s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: uqshl.4s v0, v0, v1
+; CHECK-NEXT: uqshl v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp2 = load <4 x i32>, ptr %B
@@ -227,7 +329,7 @@ define <2 x i64> @uqshl2d(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: uqshl.2d v0, v0, v1
+; CHECK-NEXT: uqshl v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp2 = load <2 x i64>, ptr %B
@@ -315,7 +417,7 @@ define <8 x i8> @srshl8b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: srshl.8b v0, v0, v1
+; CHECK-NEXT: srshl v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp2 = load <8 x i8>, ptr %B
@@ -328,7 +430,7 @@ define <4 x i16> @srshl4h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: srshl.4h v0, v0, v1
+; CHECK-NEXT: srshl v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp2 = load <4 x i16>, ptr %B
@@ -341,7 +443,7 @@ define <2 x i32> @srshl2s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: srshl.2s v0, v0, v1
+; CHECK-NEXT: srshl v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp2 = load <2 x i32>, ptr %B
@@ -394,10 +496,10 @@ define i64 @srshl_scalar(ptr %A, ptr %B) nounwind {
 define i64 @srshl_scalar_constant(ptr %A) nounwind {
 ; CHECK-LABEL: srshl_scalar_constant:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: mov w9, #1 // =0x1
-; CHECK-NEXT: fmov d1, x9
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldr x9, [x0]
+; CHECK-NEXT: mov w8, #1 // =0x1
+; CHECK-NEXT: fmov d1, x8
+; CHECK-NEXT: fmov d0, x9
 ; CHECK-NEXT: srshl d0, d0, d1
 ; CHECK-NEXT: fmov x0, d0
 ; CHECK-NEXT: ret
@@ -411,7 +513,7 @@ define <8 x i8> @urshl8b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: urshl.8b v0, v0, v1
+; CHECK-NEXT: urshl v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp2 = load <8 x i8>, ptr %B
@@ -424,7 +526,7 @@ define <4 x i16> @urshl4h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: urshl.4h v0, v0, v1
+; CHECK-NEXT: urshl v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp2 = load <4 x i16>, ptr %B
@@ -437,7 +539,7 @@ define <2 x i32> @urshl2s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: urshl.2s v0, v0, v1
+; CHECK-NEXT: urshl v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp2 = load <2 x i32>, ptr %B
@@ -490,10 +592,10 @@ define i64 @urshl_scalar(ptr %A, ptr %B) nounwind {
 define i64 @urshl_scalar_constant(ptr %A) nounwind {
 ; CHECK-LABEL: urshl_scalar_constant:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: mov w9, #1 // =0x1
-; CHECK-NEXT: fmov d1, x9
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldr x9, [x0]
+; CHECK-NEXT: mov w8, #1 // =0x1
+; CHECK-NEXT: fmov d1, x8
+; CHECK-NEXT: fmov d0, x9
 ; CHECK-NEXT: urshl d0, d0, d1
 ; CHECK-NEXT: fmov x0, d0
 ; CHECK-NEXT: ret
@@ -507,7 +609,7 @@ define <16 x i8> @srshl16b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: srshl.16b v0, v0, v1
+; CHECK-NEXT: srshl v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp2 = load <16 x i8>, ptr %B
@@ -520,7 +622,7 @@ define <8 x i16> @srshl8h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: srshl.8h v0, v0, v1
+; CHECK-NEXT: srshl v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp2 = load <8 x i16>, ptr %B
@@ -533,7 +635,7 @@ define <4 x i32> @srshl4s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: srshl.4s v0, v0, v1
+; CHECK-NEXT: srshl v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp2 = load <4 x i32>, ptr %B
@@ -546,7 +648,7 @@ define <2 x i64> @srshl2d(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: srshl.2d v0, v0, v1
+; CHECK-NEXT: srshl v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp2 = load <2 x i64>, ptr %B
@@ -559,7 +661,7 @@ define <16 x i8> @urshl16b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: urshl.16b v0, v0, v1
+; CHECK-NEXT: urshl v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp2 = load <16 x i8>, ptr %B
@@ -572,7 +674,7 @@ define <8 x i16> @urshl8h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: urshl.8h v0, v0, v1
+; CHECK-NEXT: urshl v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp2 = load <8 x i16>, ptr %B
@@ -585,7 +687,7 @@ define <4 x i32> @urshl4s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: urshl.4s v0, v0, v1
+; CHECK-NEXT: urshl v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp2 = load <4 x i32>, ptr %B
@@ -598,7 +700,7 @@ define <2 x i64> @urshl2d(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: urshl.2d v0, v0, v1
+; CHECK-NEXT: urshl v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp2 = load <2 x i64>, ptr %B
@@ -633,7 +735,7 @@ define <8 x i8> @sqrshl8b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: sqrshl.8b v0, v0, v1
+; CHECK-NEXT: sqrshl v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp2 = load <8 x i8>, ptr %B
@@ -646,7 +748,7 @@ define <4 x i16> @sqrshl4h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: sqrshl.4h v0, v0, v1
+; CHECK-NEXT: sqrshl v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp2 = load <4 x i16>, ptr %B
@@ -659,7 +761,7 @@ define <2 x i32> @sqrshl2s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: sqrshl.2s v0, v0, v1
+; CHECK-NEXT: sqrshl v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp2 = load <2 x i32>, ptr %B
@@ -672,7 +774,7 @@ define <8 x i8> @uqrshl8b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: uqrshl.8b v0, v0, v1
+; CHECK-NEXT: uqrshl v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp2 = load <8 x i8>, ptr %B
@@ -685,7 +787,7 @@ define <4 x i16> @uqrshl4h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: uqrshl.4h v0, v0, v1
+; CHECK-NEXT: uqrshl v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp2 = load <4 x i16>, ptr %B
@@ -698,7 +800,7 @@ define <2 x i32> @uqrshl2s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: uqrshl.2s v0, v0, v1
+; CHECK-NEXT: uqrshl v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp2 = load <2 x i32>, ptr %B
@@ -711,7 +813,7 @@ define <16 x i8> @sqrshl16b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqrshl.16b v0, v0, v1
+; CHECK-NEXT: sqrshl v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp2 = load <16 x i8>, ptr %B
@@ -724,7 +826,7 @@ define <8 x i16> @sqrshl8h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqrshl.8h v0, v0, v1
+; CHECK-NEXT: sqrshl v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp2 = load <8 x i16>, ptr %B
@@ -737,7 +839,7 @@ define <4 x i32> @sqrshl4s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqrshl.4s v0, v0, v1
+; CHECK-NEXT: sqrshl v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp2 = load <4 x i32>, ptr %B
@@ -750,7 +852,7 @@ define <2 x i64> @sqrshl2d(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqrshl.2d v0, v0, v1
+; CHECK-NEXT: sqrshl v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp2 = load <2 x i64>, ptr %B
@@ -803,10 +905,10 @@ define i64 @sqrshl_scalar(ptr %A, ptr %B) nounwind {
 define i64 @sqrshl_scalar_constant(ptr %A) nounwind {
 ; CHECK-LABEL: sqrshl_scalar_constant:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: mov w9, #1 // =0x1
-; CHECK-NEXT: fmov d1, x9
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldr x9, [x0]
+; CHECK-NEXT: mov w8, #1 // =0x1
+; CHECK-NEXT: fmov d1, x8
+; CHECK-NEXT: fmov d0, x9
 ; CHECK-NEXT: sqrshl d0, d0, d1
 ; CHECK-NEXT: fmov x0, d0
 ; CHECK-NEXT: ret
@@ -820,7 +922,7 @@ define <16 x i8> @uqrshl16b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: uqrshl.16b v0, v0, v1
+; CHECK-NEXT: uqrshl v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp2 = load <16 x i8>, ptr %B
@@ -833,7 +935,7 @@ define <8 x i16> @uqrshl8h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: uqrshl.8h v0, v0, v1
+; CHECK-NEXT: uqrshl v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp2 = load <8 x i16>, ptr %B
@@ -846,7 +948,7 @@ define <4 x i32> @uqrshl4s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: uqrshl.4s v0, v0, v1
+; CHECK-NEXT: uqrshl v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp2 = load <4 x i32>, ptr %B
@@ -859,7 +961,7 @@ define <2 x i64> @uqrshl2d(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: uqrshl.2d v0, v0, v1
+; CHECK-NEXT: uqrshl v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp2 = load <2 x i64>, ptr %B
@@ -912,10 +1014,10 @@ define i64 @uqrshl_scalar(ptr %A, ptr %B) nounwind {
 define i64 @uqrshl_scalar_constant(ptr %A) nounwind {
 ; CHECK-LABEL: uqrshl_scalar_constant:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: mov w9, #1 // =0x1
-; CHECK-NEXT: fmov d1, x9
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldr x9, [x0]
+; CHECK-NEXT: mov w8, #1 // =0x1
+; CHECK-NEXT: fmov d1, x8
+; CHECK-NEXT: fmov d0, x9
 ; CHECK-NEXT: uqrshl d0, d0, d1
 ; CHECK-NEXT: fmov x0, d0
 ; CHECK-NEXT: ret
@@ -947,77 +1049,126 @@ declare <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32>, <4 x i32>) nounwind
 declare <2 x i64> @llvm.aarch64.neon.uqrshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone

 define <8 x i8> @urshr8b(ptr %A) nounwind {
-; CHECK-LABEL: urshr8b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: urshr.8b v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: urshr8b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: urshr v0.8b, v0.8b, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: urshr8b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: urshl v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> )
 ret <8 x i8> %tmp3
 }

 define <4 x i16> @urshr4h(ptr %A) nounwind {
-; CHECK-LABEL: urshr4h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: urshr.4h v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: urshr4h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: urshr v0.4h, v0.4h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: urshr4h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: urshl v0.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> )
 ret <4 x i16> %tmp3
 }

 define <2 x i32> @urshr2s(ptr %A) nounwind {
-; CHECK-LABEL: urshr2s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: urshr.2s v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: urshr2s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: urshr v0.2s, v0.2s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: urshr2s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: urshl v0.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x i32> )
 ret <2 x i32> %tmp3
 }

 define <16 x i8> @urshr16b(ptr %A) nounwind {
-; CHECK-LABEL: urshr16b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: urshr.16b v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: urshr16b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: urshr v0.16b, v0.16b, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: urshr16b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: urshl v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp3 = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> )
 ret <16 x i8> %tmp3
 }

 define <8 x i16> @urshr8h(ptr %A) nounwind {
-; CHECK-LABEL: urshr8h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: urshr.8h v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: urshr8h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: urshr v0.8h, v0.8h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: urshr8h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: urshl v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> )
 ret <8 x i16> %tmp3
 }

 define <4 x i32> @urshr4s(ptr %A) nounwind {
-; CHECK-LABEL: urshr4s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: urshr.4s v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: urshr4s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: urshr v0.4s, v0.4s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: urshr4s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: urshl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> )
 ret <4 x i32> %tmp3
 }

 define <2 x i64> @urshr2d(ptr %A) nounwind {
-; CHECK-LABEL: urshr2d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: urshr.2d v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: urshr2d:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: urshr v0.2d, v0.2d, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: urshr2d:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: urshl v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp3 = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> )
 ret <2 x i64> %tmp3
@@ -1047,77 +1198,126 @@ define i64 @urshr_scalar(ptr %A) nounwind {
 }

 define <8 x i8> @srshr8b(ptr %A) nounwind {
-; CHECK-LABEL: srshr8b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: srshr.8b v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: srshr8b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: srshr v0.8b, v0.8b, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: srshr8b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: srshl v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> )
 ret <8 x i8> %tmp3
 }

 define <4 x i16> @srshr4h(ptr %A) nounwind {
-; CHECK-LABEL: srshr4h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: srshr.4h v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: srshr4h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: srshr v0.4h, v0.4h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: srshr4h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: srshl v0.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> )
 ret <4 x i16> %tmp3
 }

 define <2 x i32> @srshr2s(ptr %A) nounwind {
-; CHECK-LABEL: srshr2s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: srshr.2s v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: srshr2s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: srshr v0.2s, v0.2s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: srshr2s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: srshl v0.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> )
 ret <2 x i32> %tmp3
 }

 define <16 x i8> @srshr16b(ptr %A) nounwind {
-; CHECK-LABEL: srshr16b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: srshr.16b v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: srshr16b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: srshr v0.16b, v0.16b, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: srshr16b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: srshl v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp3 = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> )
 ret <16 x i8> %tmp3
 }

 define <8 x i16> @srshr8h(ptr %A) nounwind {
-; CHECK-LABEL: srshr8h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: srshr.8h v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: srshr8h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: srshr v0.8h, v0.8h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: srshr8h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: srshl v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> )
 ret <8 x i16> %tmp3
 }

 define <4 x i32> @srshr4s(ptr %A) nounwind {
-; CHECK-LABEL: srshr4s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: srshr.4s v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: srshr4s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: srshr v0.4s, v0.4s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: srshr4s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: srshl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> )
 ret <4 x i32> %tmp3
 }

 define <2 x i64> @srshr2d(ptr %A) nounwind {
-; CHECK-LABEL: srshr2d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: srshr.2d v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: srshr2d:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: srshr v0.2d, v0.2d, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: srshr2d:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: srshl v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp3 = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> )
 ret <2 x i64> %tmp3
@@ -1150,7 +1350,7 @@ define <8 x i8> @sqshlu8b(ptr %A) nounwind {
 ; CHECK-LABEL: sqshlu8b:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: sqshlu.8b v0, v0, #1
+; CHECK-NEXT: sqshlu v0.8b, v0.8b, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> %tmp1, <8 x i8> )
@@ -1161,7 +1361,7 @@ define <4 x i16> @sqshlu4h(ptr %A) nounwind {
 ; CHECK-LABEL: sqshlu4h:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: sqshlu.4h v0, v0, #1
+; CHECK-NEXT: sqshlu v0.4h, v0.4h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> %tmp1, <4 x i16> )
@@ -1172,7 +1372,7 @@ define <2 x i32> @sqshlu2s(ptr %A) nounwind {
 ; CHECK-LABEL: sqshlu2s:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: sqshlu.2s v0, v0, #1
+; CHECK-NEXT: sqshlu v0.2s, v0.2s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32> %tmp1, <2 x i32> )
@@ -1183,7 +1383,7 @@ define <16 x i8> @sqshlu16b(ptr %A) nounwind {
 ; CHECK-LABEL: sqshlu16b:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqshlu.16b v0, v0, #1
+; CHECK-NEXT: sqshlu v0.16b, v0.16b, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8> %tmp1, <16 x i8> )
@@ -1194,7 +1394,7 @@ define <8 x i16> @sqshlu8h(ptr %A) nounwind {
 ; CHECK-LABEL: sqshlu8h:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqshlu.8h v0, v0, #1
+; CHECK-NEXT: sqshlu v0.8h, v0.8h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16> %tmp1, <8 x i16> )
@@ -1205,7 +1405,7 @@ define <4 x i32> @sqshlu4s(ptr %A) nounwind {
 ; CHECK-LABEL: sqshlu4s:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqshlu.4s v0, v0, #1
+; CHECK-NEXT: sqshlu v0.4s, v0.4s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32> %tmp1, <4 x i32> )
@@ -1216,7 +1416,7 @@ define <2 x i64> @sqshlu2d(ptr %A) nounwind {
 ; CHECK-LABEL: sqshlu2d:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqshlu.2d v0, v0, #1
+; CHECK-NEXT: sqshlu v0.2d, v0.2d, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> %tmp1, <2 x i64> )
@@ -1275,7 +1475,7 @@ define <8 x i8> @rshrn8b(ptr %A) nounwind {
 ; CHECK-LABEL: rshrn8b:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: rshrn.8b v0, v0, #1
+; CHECK-NEXT: rshrn v0.8b, v0.8h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %tmp1, i32 1)
@@ -1286,7 +1486,7 @@ define <4 x i16> @rshrn4h(ptr %A) nounwind {
 ; CHECK-LABEL: rshrn4h:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: rshrn.4h v0, v0, #1
+; CHECK-NEXT: rshrn v0.4h, v0.4s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %tmp1, i32 1)
@@ -1297,7 +1497,7 @@ define <2 x i32> @rshrn2s(ptr %A) nounwind {
 ; CHECK-LABEL: rshrn2s:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: rshrn.2s v0, v0, #1
+; CHECK-NEXT: rshrn v0.2s, v0.2d, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> %tmp1, i32 1)
@@ -1309,7 +1509,7 @@ define <16 x i8> @rshrn16b(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: rshrn2.16b v0, v1, #1
+; CHECK-NEXT: rshrn2 v0.16b, v1.8h, #1
 ; CHECK-NEXT: ret
 %out = load <8 x i8>, ptr %ret
 %tmp1 = load <8 x i16>, ptr %A
@@ -1323,7 +1523,7 @@ define <8 x i16> @rshrn8h(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: rshrn2.8h v0, v1, #1
+; CHECK-NEXT: rshrn2 v0.8h, v1.4s, #1
 ; CHECK-NEXT: ret
 %out = load <4 x i16>, ptr %ret
 %tmp1 = load <4 x i32>, ptr %A
@@ -1337,7 +1537,7 @@ define <4 x i32> @rshrn4s(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: rshrn2.4s v0, v1, #1
+; CHECK-NEXT: rshrn2 v0.4s, v1.2d, #1
 ; CHECK-NEXT: ret
 %out = load <2 x i32>, ptr %ret
 %tmp1 = load <2 x i64>, ptr %A
@@ -1354,7 +1554,7 @@ define <8 x i8> @shrn8b(ptr %A) nounwind {
 ; CHECK-LABEL: shrn8b:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: shrn.8b v0, v0, #1
+; CHECK-NEXT: shrn v0.8b, v0.8h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp2 = lshr <8 x i16> %tmp1,
@@ -1366,7 +1566,7 @@ define <4 x i16> @shrn4h(ptr %A) nounwind {
 ; CHECK-LABEL: shrn4h:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: shrn.4h v0, v0, #1
+; CHECK-NEXT: shrn v0.4h, v0.4s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp2 = lshr <4 x i32> %tmp1,
@@ -1378,7 +1578,7 @@ define <2 x i32> @shrn2s(ptr %A) nounwind {
 ; CHECK-LABEL: shrn2s:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: shrn.2s v0, v0, #1
+; CHECK-NEXT: shrn v0.2s, v0.2d, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp2 = lshr <2 x i64> %tmp1,
@@ -1391,7 +1591,7 @@ define <16 x i8> @shrn16b(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: shrn2.16b v0, v1, #1
+; CHECK-NEXT: shrn2 v0.16b, v1.8h, #1
 ; CHECK-NEXT: ret
 %out = load <8 x i8>, ptr %ret
 %tmp1 = load <8 x i16>, ptr %A
@@ -1406,7 +1606,7 @@ define <8 x i16> @shrn8h(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: shrn2.8h v0, v1, #1
+; CHECK-NEXT: shrn2 v0.8h, v1.4s, #1
 ; CHECK-NEXT: ret
 %out = load <4 x i16>, ptr %ret
 %tmp1 = load <4 x i32>, ptr %A
@@ -1421,7 +1621,7 @@ define <4 x i32> @shrn4s(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: shrn2.4s v0, v1, #1
+; CHECK-NEXT: shrn2 v0.4s, v1.2d, #1
 ; CHECK-NEXT: ret
 %out = load <2 x i32>, ptr %ret
 %tmp1 = load <2 x i64>, ptr %A
@@ -1450,7 +1650,7 @@ define <8 x i8> @sqshrn8b(ptr %A) nounwind {
 ; CHECK-LABEL: sqshrn8b:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqshrn.8b v0, v0, #1
+; CHECK-NEXT: sqshrn v0.8b, v0.8h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> %tmp1, i32 1)
@@ -1461,7 +1661,7 @@ define <4 x i16> @sqshrn4h(ptr %A) nounwind {
 ; CHECK-LABEL: sqshrn4h:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqshrn.4h v0, v0, #1
+; CHECK-NEXT: sqshrn v0.4h, v0.4s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> %tmp1, i32 1)
@@ -1472,7 +1672,7 @@ define <2 x i32> @sqshrn2s(ptr %A) nounwind {
 ; CHECK-LABEL: sqshrn2s:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqshrn.2s v0, v0, #1
+; CHECK-NEXT: sqshrn v0.2s, v0.2d, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> %tmp1, i32 1)
@@ -1485,7 +1685,7 @@ define <16 x i8> @sqshrn16b(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqshrn2.16b v0, v1, #1
+; CHECK-NEXT: sqshrn2 v0.16b, v1.8h, #1
 ; CHECK-NEXT: ret
 %out = load <8 x i8>, ptr %ret
 %tmp1 = load <8 x i16>, ptr %A
@@ -1499,7 +1699,7 @@ define <8 x i16> @sqshrn8h(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqshrn2.8h v0, v1, #1
+; CHECK-NEXT: sqshrn2 v0.8h, v1.4s, #1
 ; CHECK-NEXT: ret
 %out = load <4 x i16>, ptr %ret
 %tmp1 = load <4 x i32>, ptr %A
@@ -1513,7 +1713,7 @@ define <4 x i32> @sqshrn4s(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqshrn2.4s v0, v1, #1
+; CHECK-NEXT: sqshrn2 v0.4s, v1.2d, #1
 ; CHECK-NEXT: ret
 %out = load <2 x i32>, ptr %ret
 %tmp1 = load <2 x i64>, ptr %A
@@ -1542,7 +1742,7 @@ define <8 x i8> @sqshrun8b(ptr %A) nounwind {
 ; CHECK-LABEL: sqshrun8b:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqshrun.8b v0, v0, #1
+; CHECK-NEXT: sqshrun v0.8b, v0.8h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> %tmp1, i32 1)
@@ -1553,7 +1753,7 @@ define <4 x i16> @sqshrun4h(ptr %A) nounwind {
 ; CHECK-LABEL: sqshrun4h:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqshrun.4h v0, v0, #1
+; CHECK-NEXT: sqshrun v0.4h, v0.4s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> %tmp1, i32 1)
@@ -1564,7 +1764,7 @@ define <2 x i32> @sqshrun2s(ptr %A) nounwind {
 ; CHECK-LABEL: sqshrun2s:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqshrun.2s v0, v0, #1
+; CHECK-NEXT: sqshrun v0.2s, v0.2d, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> %tmp1, i32 1)
@@ -1576,7 +1776,7 @@ define <16 x i8> @sqshrun16b(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqshrun2.16b v0, v1, #1
+; CHECK-NEXT: sqshrun2 v0.16b, v1.8h, #1
 ; CHECK-NEXT: ret
 %out = load <8 x i8>, ptr %ret
 %tmp1 = load <8 x i16>, ptr %A
@@ -1590,7 +1790,7 @@ define <8 x i16> @sqshrun8h(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqshrun2.8h v0, v1, #1
+; CHECK-NEXT: sqshrun2 v0.8h, v1.4s, #1
 ; CHECK-NEXT: ret
 %out = load <4 x i16>, ptr %ret
 %tmp1 = load <4 x i32>, ptr %A
@@ -1604,7 +1804,7 @@ define <4 x i32> @sqshrun4s(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqshrun2.4s v0, v1, #1
+; CHECK-NEXT: sqshrun2 v0.4s, v1.2d, #1
 ; CHECK-NEXT: ret
 %out = load <2 x i32>, ptr %ret
 %tmp1 = load <2 x i64>, ptr %A
@@ -1633,7 +1833,7 @@ define <8 x i8> @sqrshrn8b(ptr %A) nounwind {
 ; CHECK-LABEL: sqrshrn8b:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqrshrn.8b v0, v0, #1
+; CHECK-NEXT: sqrshrn v0.8b, v0.8h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
@@ -1644,7 +1844,7 @@ define <4 x i16> @sqrshrn4h(ptr %A) nounwind {
 ; CHECK-LABEL: sqrshrn4h:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqrshrn.4h v0, v0, #1
+; CHECK-NEXT: sqrshrn v0.4h, v0.4s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
@@ -1655,7 +1855,7 @@ define <2 x i32> @sqrshrn2s(ptr %A) nounwind {
 ; CHECK-LABEL: sqrshrn2s:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqrshrn.2s v0, v0, #1
+; CHECK-NEXT: sqrshrn v0.2s, v0.2d, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
@@ -1667,7 +1867,7 @@ define <16 x i8> @sqrshrn16b(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqrshrn2.16b v0, v1, #1
+; CHECK-NEXT: sqrshrn2 v0.16b, v1.8h, #1
 ; CHECK-NEXT: ret
 %out = load <8 x i8>, ptr %ret
 %tmp1 = load <8 x i16>, ptr %A
@@ -1681,7 +1881,7 @@ define <8 x i16> @sqrshrn8h(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqrshrn2.8h v0, v1, #1
+; CHECK-NEXT: sqrshrn2 v0.8h, v1.4s, #1
 ; CHECK-NEXT: ret
 %out = load <4 x i16>, ptr %ret
 %tmp1 = load <4 x i32>, ptr %A
@@ -1695,7 +1895,7 @@ define <4 x i32> @sqrshrn4s(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqrshrn2.4s v0, v1, #1
+; CHECK-NEXT: sqrshrn2 v0.4s, v1.2d, #1
 ; CHECK-NEXT: ret
 %out = load <2 x i32>, ptr %ret
 %tmp1 = load <2 x i64>, ptr %A
@@ -1724,7 +1924,7 @@ define <8 x i8> @sqrshrun8b(ptr %A) nounwind {
 ; CHECK-LABEL: sqrshrun8b:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqrshrun.8b v0, v0, #1
+; CHECK-NEXT: sqrshrun v0.8b, v0.8h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> %tmp1, i32 1)
@@ -1735,7 +1935,7 @@ define <4 x i16> @sqrshrun4h(ptr %A) nounwind {
 ; CHECK-LABEL: sqrshrun4h:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqrshrun.4h v0, v0, #1
+; CHECK-NEXT: sqrshrun v0.4h, v0.4s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> %tmp1, i32 1)
@@ -1746,7 +1946,7 @@ define <2 x i32> @sqrshrun2s(ptr %A) nounwind {
 ; CHECK-LABEL: sqrshrun2s:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqrshrun.2s v0, v0, #1
+; CHECK-NEXT: sqrshrun v0.2s, v0.2d, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> %tmp1, i32 1)
@@ -1758,7 +1958,7 @@ define <16 x i8> @sqrshrun16b(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqrshrun2.16b v0, v1, #1
+; CHECK-NEXT: sqrshrun2 v0.16b, v1.8h, #1
 ; CHECK-NEXT: ret
 %out = load <8 x i8>, ptr %ret
 %tmp1 = load <8 x i16>, ptr %A
@@ -1772,7 +1972,7 @@ define <8 x i16> @sqrshrun8h(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqrshrun2.8h v0, v1, #1
+; CHECK-NEXT: sqrshrun2 v0.8h, v1.4s, #1
 ; CHECK-NEXT: ret
 %out = load <4 x i16>, ptr %ret
 %tmp1 = load <4 x i32>, ptr %A
@@ -1786,7 +1986,7 @@ define <4 x i32> @sqrshrun4s(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sqrshrun2.4s v0, v1, #1
+; CHECK-NEXT: sqrshrun2 v0.4s, v1.2d, #1
 ; CHECK-NEXT: ret
 %out = load <2 x i32>, ptr %ret
 %tmp1 = load <2 x i64>, ptr %A
@@ -1815,7 +2015,7 @@ define <8 x i8> @uqrshrn8b(ptr %A) nounwind {
 ; CHECK-LABEL: uqrshrn8b:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: uqrshrn.8b v0, v0, #1
+; CHECK-NEXT: uqrshrn v0.8b, v0.8h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
@@ -1826,7 +2026,7 @@ define <4 x i16> @uqrshrn4h(ptr %A) nounwind {
 ; CHECK-LABEL: uqrshrn4h:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: uqrshrn.4h v0, v0, #1
+; CHECK-NEXT: uqrshrn v0.4h, v0.4s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
@@ -1837,7 +2037,7 @@ define <2 x i32> @uqrshrn2s(ptr %A) nounwind {
 ; CHECK-LABEL: uqrshrn2s:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: uqrshrn.2s v0, v0, #1
+; CHECK-NEXT: uqrshrn v0.2s, v0.2d, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
@@ -1849,7 +2049,7 @@ define <16 x i8> @uqrshrn16b(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: uqrshrn2.16b v0, v1, #1
+; CHECK-NEXT: uqrshrn2 v0.16b, v1.8h, #1
 ; CHECK-NEXT: ret
 %out = load <8 x i8>, ptr %ret
 %tmp1 = load <8 x i16>, ptr %A
@@ -1863,7 +2063,7 @@ define <8 x i16> @uqrshrn8h(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: uqrshrn2.8h v0, v1, #1
+; CHECK-NEXT: uqrshrn2 v0.8h, v1.4s, #1
 ; CHECK-NEXT: ret
 %out = load <4 x i16>, ptr %ret
 %tmp1 = load <4 x i32>, ptr %A
@@ -1877,7 +2077,7 @@ define <4 x i32> @uqrshrn4s(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: uqrshrn2.4s v0, v1, #1
+; CHECK-NEXT: uqrshrn2 v0.4s, v1.2d, #1
 ; CHECK-NEXT: ret
 %out = load <2 x i32>, ptr %ret
 %tmp1 = load <2 x i64>, ptr %A
@@ -1906,7 +2106,7 @@ define <8 x i8> @uqshrn8b(ptr %A) nounwind {
 ; CHECK-LABEL: uqshrn8b:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: uqshrn.8b v0, v0, #1
+; CHECK-NEXT: uqshrn v0.8b, v0.8h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> %tmp1, i32 1)
@@ -1917,7 +2117,7 @@ define <4 x i16> @uqshrn4h(ptr %A) nounwind {
 ; CHECK-LABEL: uqshrn4h:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: uqshrn.4h v0, v0, #1
+; CHECK-NEXT: uqshrn v0.4h, v0.4s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> %tmp1, i32 1)
@@ -1928,7 +2128,7 @@ define <2 x i32> @uqshrn2s(ptr %A) nounwind {
 ; CHECK-LABEL: uqshrn2s:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: uqshrn.2s v0, v0, #1
+; CHECK-NEXT: uqshrn v0.2s, v0.2d, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> %tmp1, i32 1)
@@ -1940,7 +2140,7 @@ define <16 x i8> @uqshrn16b(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: uqshrn2.16b v0, v1, #1
+; CHECK-NEXT: uqshrn2 v0.16b, v1.8h, #1
 ; CHECK-NEXT: ret
 %out = load <8 x i8>, ptr %ret
 %tmp1 = load <8 x i16>, ptr %A
@@ -1954,7 +2154,7 @@ define <8 x i16> @uqshrn8h(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: uqshrn2.8h v0, v1, #1
+; CHECK-NEXT: uqshrn2 v0.8h, v1.4s, #1
 ; CHECK-NEXT: ret
 %out = load <4 x i16>, ptr %ret
 %tmp1 = load <4 x i32>, ptr %A
@@ -1968,7 +2168,7 @@ define <4 x i32> @uqshrn4s(ptr %ret, ptr %A) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: uqshrn2.4s v0, v1, #1
+; CHECK-NEXT: uqshrn2 v0.4s, v1.2d, #1
 ; CHECK-NEXT: ret
 %out = load <2 x i32>, ptr %ret
 %tmp1 = load <2 x i64>, ptr %A
@@ -1986,7 +2186,7 @@ define <8 x i16> @ushll8h(ptr %A) nounwind {
 ; CHECK-LABEL: ushll8h:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ushll.8h v0, v0, #1
+; CHECK-NEXT: ushll v0.8h, v0.8b, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp2 = zext <8 x i8> %tmp1 to <8 x i16>
@@ -1998,7 +2198,7 @@ define <4 x i32> @ushll4s(ptr %A) nounwind {
 ; CHECK-LABEL: ushll4s:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ushll.4s v0, v0, #1
+; CHECK-NEXT: ushll v0.4s, v0.4h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp2 = zext <4 x i16> %tmp1 to <4 x i32>
@@ -2010,7 +2210,7 @@ define <2 x i64> @ushll2d(ptr %A) nounwind {
 ; CHECK-LABEL: ushll2d:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ushll.2d v0, v0, #1
+; CHECK-NEXT: ushll v0.2d, v0.2s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp2 = zext <2 x i32> %tmp1 to <2 x i64>
@@ -2019,11 +2219,18 @@ define <2 x i64> @ushll2d(ptr %A) nounwind {
 }

 define <8 x i16> @ushll2_8h(ptr %A) nounwind {
-; CHECK-LABEL: ushll2_8h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0, #8]
-; CHECK-NEXT: ushll.8h v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ushll2_8h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0, #8]
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ushll2_8h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: mov d0, v0.d[1]
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #1
+; CHECK-GI-NEXT: ret
 %load1 = load <16 x i8>, ptr %A
 %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32>
 %tmp2 = zext <8 x i8> %tmp1 to <8 x i16>
@@ -2032,11 +2239,18 @@ define <8 x i16> @ushll2_8h(ptr %A) nounwind {
 }

 define <4 x i32> @ushll2_4s(ptr %A) nounwind {
-; CHECK-LABEL: ushll2_4s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0, #8]
-; CHECK-NEXT: ushll.4s v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ushll2_4s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0, #8]
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ushll2_4s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: mov d0, v0.d[1]
+; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #1
+; CHECK-GI-NEXT: ret
 %load1 = load <8 x i16>, ptr %A
 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32>
 %tmp2 = zext <4 x i16> %tmp1 to <4 x i32>
@@ -2045,11 +2259,18 @@ define <4 x i32> @ushll2_4s(ptr %A) nounwind {
 }

 define <2 x i64> @ushll2_2d(ptr %A) nounwind {
-; CHECK-LABEL: ushll2_2d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0, #8]
-; CHECK-NEXT: ushll.2d v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ushll2_2d:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0, #8]
+; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ushll2_2d:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: mov d0, v0.d[1]
+; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #1
+; CHECK-GI-NEXT: ret
 %load1 = load <4 x i32>, ptr %A
 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32>
 %tmp2 = zext <2 x i32> %tmp1 to <2 x i64>
@@ -2064,24 +2285,32 @@ declare <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64>, <2 x i64>)
 declare <1 x i64> @llvm.aarch64.neon.ushl.v1i64(<1 x i64>, <1 x i64>)
 declare i64 @llvm.aarch64.neon.ushl.i64(i64, i64)

-define <8 x i16> @neon.ushll8h_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.ushll8h_constant_shift:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ushll.8h v0, v0, #1
-; CHECK-NEXT: ret
+define <8 x i16> @neon_ushll8h_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_ushll8h_constant_shift:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_ushll8h_constant_shift:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: movi v1.8h, #1
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ushl v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp2 = zext <8 x i8> %tmp1 to <8 x i16>
 %tmp3 = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> %tmp2, <8 x i16> )
 ret <8 x i16> %tmp3
 }

-define <8 x i16> @neon.ushl8h_no_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.ushl8h_no_constant_shift:
+define <8 x i16> @neon_ushl8h_no_constant_shift(ptr %A) nounwind {
+; CHECK-LABEL: neon_ushl8h_no_constant_shift:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ushll.8h v0, v0, #0
-; CHECK-NEXT: ushl.8h v0, v0, v0
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushl v0.8h, v0.8h, v0.8h
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp2 = zext <8 x i8> %tmp1 to <8 x i16>
@@ -2089,36 +2318,76 @@ define <8 x i16> @neon.ushl8h_no_constant_shift(ptr %A) nounwind {
 ret <8 x i16> %tmp3
 }

-define <4 x i32> @neon.ushl8h_constant_shift_extend_not_2x(ptr %A) nounwind {
-; CHECK-LABEL: neon.ushl8h_constant_shift_extend_not_2x:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr s0, [x0]
-; CHECK-NEXT: ushll.8h v0, v0, #0
-; CHECK-NEXT: ushll.4s v0, v0, #1
-; CHECK-NEXT: ret
+define <4 x i32> @neon_ushl8h_constant_shift_extend_not_2x(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_ushl8h_constant_shift_extend_not_2x:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_ushl8h_constant_shift_extend_not_2x:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr w8, [x0]
+; CHECK-GI-NEXT: movi v0.4s, #1
+; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: uxtb w8, w8
+; CHECK-GI-NEXT: mov b2, v1.b[2]
+; CHECK-GI-NEXT: mov b3, v1.b[1]
+; CHECK-GI-NEXT: mov b4, v1.b[3]
+; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: fmov w9, s2
+; CHECK-GI-NEXT: fmov w10, s3
+; CHECK-GI-NEXT: fmov w11, s4
+; CHECK-GI-NEXT: uxtb w9, w9
+; CHECK-GI-NEXT: uxtb w10, w10
+; CHECK-GI-NEXT: uxtb w11, w11
+; CHECK-GI-NEXT: fmov s2, w9
+; CHECK-GI-NEXT: mov v1.h[1], w10
+; CHECK-GI-NEXT: mov v2.h[1], w11
+; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT: mov v1.d[1], v2.d[0]
+; CHECK-GI-NEXT: ushl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i8>, ptr %A
 %tmp2 = zext <4 x i8> %tmp1 to <4 x i32>
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> %tmp2, <4 x i32> )
 ret <4 x i32> %tmp3
 }

-define <8 x i16> @neon.ushl8_noext_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.ushl8_noext_constant_shift:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: add.8h v0, v0, v0
-; CHECK-NEXT: ret
+define <8 x i16> @neon_ushl8_noext_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_ushl8_noext_constant_shift:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: add v0.8h, v0.8h, v0.8h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_ushl8_noext_constant_shift:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.8h, #1
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: ushl v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> %tmp1, <8 x i16> )
 ret <8 x i16> %tmp3
 }

-define <4 x i32> @neon.ushll4s_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.ushll4s_constant_shift:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ushll.4s v0, v0, #1
-; CHECK-NEXT: ret
+define <4 x i32> @neon_ushll4s_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_ushll4s_constant_shift:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_ushll4s_constant_shift:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: movi v1.4s, #1
+; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp2 = zext <4 x i16> %tmp1 to <4 x i32>
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> %tmp2, <4 x i32> )
@@ -2126,13 +2395,21 @@ define <4 x i32> @neon.ushll4s_constant_shift(ptr %A) nounwind {
 }

 ; FIXME: unnecessary ushll.4s v0, v0, #0?
-define <4 x i32> @neon.ushll4s_neg_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.ushll4s_neg_constant_shift:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ushll.4s v0, v0, #0
-; CHECK-NEXT: ushr.4s v0, v0, #1
-; CHECK-NEXT: ret
+define <4 x i32> @neon_ushll4s_neg_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_ushll4s_neg_constant_shift:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushr v0.4s, v0.4s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_ushll4s_neg_constant_shift:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: movi v1.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp2 = zext <4 x i16> %tmp1 to <4 x i32>
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> %tmp2, <4 x i32> )
@@ -2140,35 +2417,52 @@ define <4 x i32> @neon.ushll4s_neg_constant_shift(ptr %A) nounwind {
 }

 ; FIXME: should be constant folded.
-define <4 x i32> @neon.ushll4s_constant_fold() nounwind {
-; CHECK-LABEL: neon.ushll4s_constant_fold:
-; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI160_0
-; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI160_0]
-; CHECK-NEXT: add.4s v0, v0, v0
-; CHECK-NEXT: ret
+define <4 x i32> @neon_ushll4s_constant_fold() nounwind {
+; CHECK-SD-LABEL: neon_ushll4s_constant_fold:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: adrp x8, .LCPI160_0
+; CHECK-SD-NEXT: ldr q0, [x8, :lo12:.LCPI160_0]
+; CHECK-SD-NEXT: add v0.4s, v0.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_ushll4s_constant_fold:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.4s, #1
+; CHECK-GI-NEXT: adrp x8, .LCPI160_0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI160_0]
+; CHECK-GI-NEXT: ushl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: ret
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> , <4 x i32> )
 ret <4 x i32> %tmp3
 }

-define <2 x i64> @neon.ushll2d_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.ushll2d_constant_shift:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ushll.2d v0, v0, #1
-; CHECK-NEXT: ret
+define <2 x i64> @neon_ushll2d_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_ushll2d_constant_shift:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_ushll2d_constant_shift:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: adrp x8, .LCPI161_0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI161_0]
+; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT: ushl v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp2 = zext <2 x i32> %tmp1 to <2 x i64>
 %tmp3 = call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> %tmp2, <2 x i64> )
 ret <2 x i64> %tmp3
 }

-define <1 x i64> @neon.ushl_vscalar_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.ushl_vscalar_constant_shift:
+define <1 x i64> @neon_ushl_vscalar_constant_shift(ptr %A) nounwind {
+; CHECK-LABEL: neon_ushl_vscalar_constant_shift:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi.2d v1, #0000000000000000
-; CHECK-NEXT: ldr s0, [x0]
-; CHECK-NEXT: zip1.2s v0, v0, v1
+; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: ldr s1, [x0]
+; CHECK-NEXT: zip1 v0.2s, v1.2s, v0.2s
 ; CHECK-NEXT: shl d0, d0, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <1 x i32>, ptr %A
@@ -2177,8 +2471,8 @@ define <1 x i64> @neon.ushl_vscalar_constant_shift(ptr %A) nounwind {
 ret <1 x i64> %tmp3
 }

-define i64 @neon.ushl_scalar_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.ushl_scalar_constant_shift:
+define i64 @neon_ushl_scalar_constant_shift(ptr %A) nounwind {
+; CHECK-LABEL: neon_ushl_scalar_constant_shift:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr w8, [x0]
 ; CHECK-NEXT: fmov d0, x8
@@ -2195,7 +2489,7 @@ define <8 x i16> @sshll8h(ptr %A) nounwind {
 ; CHECK-LABEL: sshll8h:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: sshll.8h v0, v0, #1
+; CHECK-NEXT: sshll v0.8h, v0.8b, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp2 = sext <8 x i8> %tmp1 to <8 x i16>
@@ -2207,7 +2501,7 @@ define <2 x i64> @sshll2d(ptr %A) nounwind {
 ; CHECK-LABEL: sshll2d:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: sshll.2d v0, v0, #1
+; CHECK-NEXT: sshll v0.2d, v0.2s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp2 = sext <2 x i32> %tmp1 to <2 x i64>
@@ -2222,85 +2516,156 @@ declare <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64>, <2 x i64>)
 declare <1 x i64> @llvm.aarch64.neon.sshl.v1i64(<1 x i64>, <1 x i64>)
 declare i64 @llvm.aarch64.neon.sshl.i64(i64, i64)

-define <16 x i8> @neon.sshl16b_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshl16b_constant_shift:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: add.16b v0, v0, v0
-; CHECK-NEXT: ret
+define <16 x i8> @neon_sshl16b_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_sshl16b_constant_shift:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: add v0.16b, v0.16b, v0.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_sshl16b_constant_shift:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.16b, #1
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: sshl v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp2 = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> %tmp1, <16 x i8> )
 ret <16 x i8> %tmp2
 }

-define <16 x i8> @neon.sshl16b_non_splat_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshl16b_non_splat_constant_shift:
-; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI167_0
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI167_0]
-; CHECK-NEXT: sshl.16b v0, v0, v1
-; CHECK-NEXT: ret
+define <16 x i8> @neon_sshl16b_non_splat_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_sshl16b_non_splat_constant_shift:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: adrp x8, .LCPI167_0
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI167_0]
+; CHECK-SD-NEXT: sshl v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_sshl16b_non_splat_constant_shift:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI167_0
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI167_0]
+; CHECK-GI-NEXT: sshl v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp2 = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> %tmp1, <16 x i8> )
 ret <16 x i8> %tmp2
 }

-define <16 x i8> @neon.sshl16b_neg_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshl16b_neg_constant_shift:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sshr.16b v0, v0, #2
-; CHECK-NEXT: ret
+define <16 x i8> @neon_sshl16b_neg_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_sshl16b_neg_constant_shift:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: sshr v0.16b, v0.16b, #2
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_sshl16b_neg_constant_shift:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.16b, #254
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: sshl v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp2 = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> %tmp1, <16 x i8> )
 ret <16 x i8> %tmp2
 }

-define <8 x i16> @neon.sshll8h_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshll8h_constant_shift:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: sshll.8h v0, v0, #1
-; CHECK-NEXT: ret
+define <8 x i16> @neon_sshll8h_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_sshll8h_constant_shift:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_sshll8h_constant_shift:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: movi v1.8h, #1
+; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: sshl v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp2 = sext <8 x i8> %tmp1 to <8 x i16>
 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sshl.v8i16(<8 x i16> %tmp2, <8 x i16> )
 ret <8 x i16> %tmp3
 }

-define <4 x i32> @neon.sshl4s_wrong_ext_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshl4s_wrong_ext_constant_shift:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr s0, [x0]
-; CHECK-NEXT: sshll.8h v0, v0, #0
-; CHECK-NEXT: sshll.4s v0, v0, #1
-; CHECK-NEXT: ret
+define <4 x i32> @neon_sshl4s_wrong_ext_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_sshl4s_wrong_ext_constant_shift:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_sshl4s_wrong_ext_constant_shift:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr w8, [x0]
+; CHECK-GI-NEXT: movi v0.4s, #1
+; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: mov b2, v1.b[2]
+; CHECK-GI-NEXT: mov b3, v1.b[1]
+; CHECK-GI-NEXT: mov b4, v1.b[3]
+; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: fmov w9, s2
+; CHECK-GI-NEXT: fmov w10, s3
+; CHECK-GI-NEXT: fmov w11, s4
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: fmov s2, w9
+; CHECK-GI-NEXT: mov v1.h[1], w10
+; CHECK-GI-NEXT: mov v2.h[1], w11
+; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT: mov v1.d[1], v2.d[0]
+; CHECK-GI-NEXT: sshl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i8>, ptr %A
 %tmp2 = sext <4 x i8> %tmp1 to <4 x i32>
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> %tmp2, <4 x i32> )
 ret <4 x i32> %tmp3
 }

-define <4 x i32> @neon.sshll4s_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshll4s_constant_shift:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: sshll.4s v0, v0, #1
-; CHECK-NEXT: ret
+define <4 x i32> @neon_sshll4s_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_sshll4s_constant_shift:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_sshll4s_constant_shift:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: movi v1.4s, #1
+; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp2 = sext <4 x i16> %tmp1 to <4 x i32>
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> %tmp2, <4 x i32> )
 ret <4 x i32> %tmp3
 }

-define <4 x i32> @neon.sshll4s_neg_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshll4s_neg_constant_shift:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: sshll.4s v0, v0, #0
-; CHECK-NEXT: sshr.4s v0, v0, #1
-; CHECK-NEXT: ret
+define <4 x i32> @neon_sshll4s_neg_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_sshll4s_neg_constant_shift:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: sshr v0.4s, v0.4s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_sshll4s_neg_constant_shift:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: movi v1.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp2 = sext <4 x i16> %tmp1 to <4 x i32>
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> %tmp2, <4 x i32> )
@@ -2308,46 +2673,70 @@ define <4 x i32> @neon.sshll4s_neg_constant_shift(ptr %A) nounwind {
 }

 ; FIXME: should be constant folded.
-define <4 x i32> @neon.sshl4s_constant_fold() nounwind {
-; CHECK-LABEL: neon.sshl4s_constant_fold:
-; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI173_0
-; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI173_0]
-; CHECK-NEXT: shl.4s v0, v0, #2
-; CHECK-NEXT: ret
+define <4 x i32> @neon_sshl4s_constant_fold() nounwind {
+; CHECK-SD-LABEL: neon_sshl4s_constant_fold:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: adrp x8, .LCPI173_0
+; CHECK-SD-NEXT: ldr q0, [x8, :lo12:.LCPI173_0]
+; CHECK-SD-NEXT: shl v0.4s, v0.4s, #2
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_sshl4s_constant_fold:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.4s, #2
+; CHECK-GI-NEXT: adrp x8, .LCPI173_0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI173_0]
+; CHECK-GI-NEXT: sshl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: ret
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> , <4 x i32> )
 ret <4 x i32> %tmp3
 }

-define <4 x i32> @neon.sshl4s_no_fold(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshl4s_no_fold:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: add.4s v0, v0, v0
-; CHECK-NEXT: ret
+define <4 x i32> @neon_sshl4s_no_fold(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_sshl4s_no_fold:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: add v0.4s, v0.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_sshl4s_no_fold:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.4s, #1
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: sshl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> %tmp1, <4 x i32> )
 ret <4 x i32> %tmp3
 }

-define <2 x i64> @neon.sshll2d_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshll2d_constant_shift:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: sshll.2d v0, v0, #1
-; CHECK-NEXT: ret
+define <2 x i64> @neon_sshll2d_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_sshll2d_constant_shift:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: sshll v0.2d, v0.2s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_sshll2d_constant_shift:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: adrp x8, .LCPI175_0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI175_0]
+; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT: sshl v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp2 = sext <2 x i32> %tmp1 to <2 x i64>
 %tmp3 = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> %tmp2, <2 x i64> )
 ret <2 x i64> %tmp3
 }

-define <1 x i64> @neon.sshll_vscalar_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshll_vscalar_constant_shift:
+define <1 x i64> @neon_sshll_vscalar_constant_shift(ptr %A) nounwind {
+; CHECK-LABEL: neon_sshll_vscalar_constant_shift:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi.2d v1, #0000000000000000
-; CHECK-NEXT: ldr s0, [x0]
-; CHECK-NEXT: zip1.2s v0, v0, v1
+; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: ldr s1, [x0]
+; CHECK-NEXT: zip1 v0.2s, v1.2s, v0.2s
 ; CHECK-NEXT: shl d0, d0, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <1 x i32>, ptr %A
@@ -2356,8 +2745,8 @@ define <1 x i64> @neon.sshll_vscalar_constant_shift(ptr %A) nounwind {
 ret <1 x i64> %tmp3
 }

-define i64 @neon.sshll_scalar_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshll_scalar_constant_shift:
+define i64 @neon_sshll_scalar_constant_shift(ptr %A) nounwind {
+; CHECK-LABEL: neon_sshll_scalar_constant_shift:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr w8, [x0]
 ; CHECK-NEXT: fmov d0, x8
@@ -2370,8 +2759,8 @@ define i64 @neon.sshll_scalar_constant_shift(ptr %A) nounwind {
 ret i64 %tmp3
 }

-define i64 @neon.sshll_scalar_constant_shift_m1(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshll_scalar_constant_shift_m1:
+define i64 @neon_sshll_scalar_constant_shift_m1(ptr %A) nounwind {
+; CHECK-LABEL: neon_sshll_scalar_constant_shift_m1:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr w8, [x0]
 ; CHECK-NEXT: fmov d0, x8
@@ -2385,34 +2774,58 @@ define i64 @neon.sshll_scalar_constant_shift_m1(ptr %A) nounwind {
 }

 ; FIXME: should be constant folded.
-define <2 x i64> @neon.sshl2d_constant_fold() nounwind {
-; CHECK-LABEL: neon.sshl2d_constant_fold:
-; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI179_0
-; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI179_0]
-; CHECK-NEXT: add.2d v0, v0, v0
-; CHECK-NEXT: ret
+define <2 x i64> @neon_sshl2d_constant_fold() nounwind {
+; CHECK-SD-LABEL: neon_sshl2d_constant_fold:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: adrp x8, .LCPI179_0
+; CHECK-SD-NEXT: ldr q0, [x8, :lo12:.LCPI179_0]
+; CHECK-SD-NEXT: add v0.2d, v0.2d, v0.2d
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_sshl2d_constant_fold:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI179_1
+; CHECK-GI-NEXT: adrp x9, .LCPI179_0
+; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI179_1]
+; CHECK-GI-NEXT: ldr q1, [x9, :lo12:.LCPI179_0]
+; CHECK-GI-NEXT: sshl v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: ret
 %tmp3 = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> , <2 x i64> )
 ret <2 x i64> %tmp3
 }

-define <2 x i64> @neon.sshl2d_no_fold(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshl2d_no_fold:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: shl.2d v0, v0, #2
-; CHECK-NEXT: ret
+define <2 x i64> @neon_sshl2d_no_fold(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_sshl2d_no_fold:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: shl v0.2d, v0.2d, #2
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: neon_sshl2d_no_fold:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI180_0
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI180_0]
+; CHECK-GI-NEXT: sshl v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT: ret
 %tmp2 = load <2 x i64>, ptr %A
 %tmp3 = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> %tmp2, <2 x i64> )
 ret <2 x i64> %tmp3
 }

 define <8 x i16> @sshll2_8h(ptr %A) nounwind {
-; CHECK-LABEL: sshll2_8h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0, #8]
-; CHECK-NEXT: sshll.8h v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sshll2_8h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0, #8]
+; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sshll2_8h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: mov d0, v0.d[1]
+; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #1
+; CHECK-GI-NEXT: ret
 %load1 = load <16 x i8>, ptr %A
 %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32>
 %tmp2 = sext <8 x i8> %tmp1 to <8 x i16>
@@ -2421,11 +2834,18 @@ define <8 x i16> @sshll2_8h(ptr %A) nounwind {
 }

 define <4 x i32> @sshll2_4s(ptr %A) nounwind {
-; CHECK-LABEL: sshll2_4s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0, #8]
-; CHECK-NEXT: sshll.4s v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sshll2_4s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0, #8]
+; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sshll2_4s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: mov d0, v0.d[1]
+; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #1
+; CHECK-GI-NEXT: ret
 %load1 = load <8 x i16>, ptr %A
 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32>
 %tmp2 = sext <4 x i16> %tmp1 to <4 x i32>
@@ -2434,11 +2854,18 @@ define <4 x i32> @sshll2_4s(ptr %A) nounwind {
 }

 define <2 x i64> @sshll2_2d(ptr %A) nounwind {
-; CHECK-LABEL: sshll2_2d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0, #8]
-; CHECK-NEXT: sshll.2d v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sshll2_2d:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0, #8]
+; CHECK-SD-NEXT: sshll v0.2d, v0.2s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sshll2_2d:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: mov d0, v0.d[1]
+; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #1
+; CHECK-GI-NEXT: ret
 %load1 = load <4 x i32>, ptr %A
 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32>
 %tmp2 = sext <2 x i32> %tmp1 to <2 x i64>
@@ -2447,88 +2874,145 @@ define <2 x i64> @sshll2_2d(ptr %A) nounwind {
 }

 define <8 x i8> @sqshli8b(ptr %A) nounwind {
-; CHECK-LABEL: sqshli8b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: sqshl.8b v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sqshli8b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: sqshl v0.8b, v0.8b, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sqshli8b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.8b, #1
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: sqshl v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %tmp1, <8 x i8> )
 ret <8 x i8> %tmp3
 }

 define <4 x i16> @sqshli4h(ptr %A) nounwind {
-; CHECK-LABEL: sqshli4h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: sqshl.4h v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sqshli4h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: sqshl v0.4h, v0.4h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sqshli4h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.4h, #1
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: sqshl v0.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> %tmp1, <4 x i16> )
 ret <4 x i16> %tmp3
 }

 define <2 x i32> @sqshli2s(ptr %A) nounwind {
-; CHECK-LABEL: sqshli2s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: sqshl.2s v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sqshli2s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: sqshl v0.2s, v0.2s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sqshli2s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2s, #1
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: sqshl v0.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> %tmp1, <2 x i32> )
 ret <2 x i32> %tmp3
 }

 define <16 x i8> @sqshli16b(ptr %A) nounwind {
-; CHECK-LABEL: sqshli16b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqshl.16b v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sqshli16b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: sqshl v0.16b, v0.16b, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sqshli16b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.16b, #1
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: sqshl v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %tmp1, <16 x i8> )
 ret <16 x i8> %tmp3
 }

 define <8 x i16> @sqshli8h(ptr %A) nounwind {
-; CHECK-LABEL: sqshli8h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqshl.8h v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sqshli8h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: sqshl v0.8h, v0.8h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sqshli8h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.8h, #1
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: sqshl v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> %tmp1, <8 x i16> )
 ret <8 x i16> %tmp3
 }

 define <4 x i32> @sqshli4s(ptr %A) nounwind {
-; CHECK-LABEL: sqshli4s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqshl.4s v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sqshli4s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: sqshl v0.4s, v0.4s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sqshli4s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.4s, #1
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: sqshl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> %tmp1, <4 x i32> )
 ret <4 x i32> %tmp3
 }

 define <2 x i64> @sqshli2d(ptr %A) nounwind {
-; CHECK-LABEL: sqshli2d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sqshl.2d v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sqshli2d:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: sqshl v0.2d, v0.2d, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sqshli2d:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI190_0
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI190_0]
+; CHECK-GI-NEXT: sqshl v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> %tmp1, <2 x i64> )
 ret <2 x i64> %tmp3
 }

 define <8 x i8> @uqshli8b(ptr %A) nounwind {
-; CHECK-LABEL: uqshli8b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: uqshl.8b v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: uqshli8b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: uqshl v0.8b, v0.8b, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uqshli8b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.8b, #1
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: uqshl v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %tmp1, <8 x i8> )
 ret <8 x i8> %tmp3
@@ -2537,9 +3021,9 @@ define <8 x i8> @uqshli8b(ptr %A) nounwind {

 define <8 x i8> @uqshli8b_1(ptr %A) nounwind {
 ; CHECK-LABEL: uqshli8b_1:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi.8b v1, #8
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: uqshl.8b v0, v0, v1
+; CHECK-NEXT: movi v0.8b, #8
+; CHECK-NEXT: ldr d1, [x0]
+; CHECK-NEXT: uqshl v0.8b, v1.8b, v0.8b
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %tmp1, <8 x i8> )
@@ -2547,78 +3031,130 @@
 }

 define <4 x i16> @uqshli4h(ptr %A) nounwind {
-; CHECK-LABEL: uqshli4h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: uqshl.4h v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: uqshli4h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: uqshl v0.4h, v0.4h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uqshli4h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.4h, #1
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: uqshl v0.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> %tmp1, <4 x i16> )
 ret <4 x i16> %tmp3
 }

 define <2 x i32> @uqshli2s(ptr %A) nounwind {
-; CHECK-LABEL: uqshli2s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: uqshl.2s v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: uqshli2s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: uqshl v0.2s, v0.2s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uqshli2s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2s, #1
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: uqshl v0.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> %tmp1, <2 x i32> )
 ret <2 x i32> %tmp3
 }

 define <16 x i8> @uqshli16b(ptr %A) nounwind {
-; CHECK-LABEL: uqshli16b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: uqshl.16b v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: uqshli16b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: uqshl v0.16b, v0.16b, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uqshli16b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.16b, #1
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: uqshl v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp3 = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %tmp1, <16 x i8> )
 ret <16 x i8> %tmp3
 }

 define <8 x i16> @uqshli8h(ptr %A) nounwind {
-; CHECK-LABEL: uqshli8h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: uqshl.8h v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: uqshli8h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: uqshl v0.8h, v0.8h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uqshli8h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.8h, #1
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: uqshl v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> %tmp1, <8 x i16> )
 ret <8 x i16> %tmp3
 }

 define <4 x i32> @uqshli4s(ptr %A) nounwind {
-; CHECK-LABEL: uqshli4s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: uqshl.4s v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: uqshli4s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: uqshl v0.4s, v0.4s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uqshli4s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.4s, #1
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: uqshl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> %tmp1, <4 x i32> )
 ret <4 x i32> %tmp3
 }

 define <2 x i64> @uqshli2d(ptr %A) nounwind {
-; CHECK-LABEL: uqshli2d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: uqshl.2d v0, v0, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: uqshli2d:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: uqshl v0.2d, v0.2d, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uqshli2d:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI198_0
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI198_0]
+; CHECK-GI-NEXT: uqshl v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp3 = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> %tmp1, <2 x i64> )
 ret <2 x i64> %tmp3
 }

 define <8 x i8> @ursra8b(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: ursra8b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: ursra.8b v0, v1, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ursra8b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d1, [x0]
+; CHECK-SD-NEXT: ldr d0, [x1]
+; CHECK-SD-NEXT: ursra v0.8b, v1.8b, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ursra8b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: urshl v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: add v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> )
 %tmp4 = load <8 x i8>, ptr %B
@@ -2627,12 +3163,21 @@ define <8 x i8> @ursra8b(ptr %A, ptr %B) nounwind {
 }

 define <4 x i16> @ursra4h(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: ursra4h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: ursra.4h v0, v1, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ursra4h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d1, [x0]
+; CHECK-SD-NEXT: ldr d0, [x1]
+; CHECK-SD-NEXT: ursra v0.4h, v1.4h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ursra4h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: urshl v0.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> )
 %tmp4 = load <4 x i16>, ptr %B
@@ -2641,12 +3186,21 @@ define <4 x i16> @ursra4h(ptr %A, ptr %B) nounwind {
 }

 define <2 x i32> @ursra2s(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: ursra2s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: ursra.2s v0, v1, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ursra2s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d1, [x0]
+; CHECK-SD-NEXT: ldr d0, [x1]
+; CHECK-SD-NEXT: ursra v0.2s, v1.2s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ursra2s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: urshl v0.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: add v0.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x i32> )
 %tmp4 = load <2 x i32>, ptr %B
@@ -2655,12 +3209,21 @@ define <2 x i32> @ursra2s(ptr %A, ptr %B) nounwind {
 }

 define <16 x i8> @ursra16b(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: ursra16b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q1, [x0]
-; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: ursra.16b v0, v1, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ursra16b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q1, [x0]
+; CHECK-SD-NEXT: ldr q0, [x1]
+; CHECK-SD-NEXT: ursra v0.16b, v1.16b, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ursra16b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: urshl v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: add v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp3 = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> )
 %tmp4 = load <16 x i8>, ptr %B
@@ -2669,12 +3232,21 @@ define <16 x i8> @ursra16b(ptr %A, ptr %B) nounwind {
 }

 define <8 x i16> @ursra8h(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: ursra8h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q1, [x0]
-; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: ursra.8h v0, v1, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ursra8h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q1, [x0]
+; CHECK-SD-NEXT: ldr q0, [x1]
+; CHECK-SD-NEXT: ursra v0.8h, v1.8h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ursra8h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: urshl v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: add v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> )
 %tmp4 = load <8 x i16>, ptr %B
@@ -2683,12 +3255,21 @@ define <8 x i16> @ursra8h(ptr %A, ptr %B) nounwind {
 }

 define <4 x i32> @ursra4s(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: ursra4s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q1, [x0]
-; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: ursra.4s v0, v1, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ursra4s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q1, [x0]
+; CHECK-SD-NEXT: ldr q0, [x1]
+; CHECK-SD-NEXT: ursra v0.4s, v1.4s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ursra4s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: urshl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> )
 %tmp4 = load <4 x i32>, ptr %B
@@ -2697,12 +3278,21 @@ define <4 x i32> @ursra4s(ptr %A, ptr %B) nounwind {
 }

 define <2 x i64> @ursra2d(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: ursra2d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q1, [x0]
-; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: ursra.2d v0, v1, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ursra2d:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q1, [x0]
+; CHECK-SD-NEXT: ldr q0, [x1]
+; CHECK-SD-NEXT: ursra v0.2d, v1.2d, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ursra2d:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: urshl v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp3 = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> )
 %tmp4 = load <2 x i64>, ptr %B
@@ -2740,12 +3330,21 @@ define i64 @ursra_scalar(ptr %A, ptr %B) nounwind {
 }

 define <8 x i8> @srsra8b(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: srsra8b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: srsra.8b v0, v1, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: srsra8b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d1, [x0]
+; CHECK-SD-NEXT: ldr d0, [x1]
+; CHECK-SD-NEXT: srsra v0.8b, v1.8b, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: srsra8b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: srshl v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: add v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> )
 %tmp4 = load <8 x i8>, ptr %B
@@ -2754,12 +3353,21 @@ define <8 x i8> @srsra8b(ptr %A, ptr %B) nounwind {
 }

 define <4 x i16> @srsra4h(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: srsra4h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: srsra.4h v0, v1, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: srsra4h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d1, [x0]
+; CHECK-SD-NEXT: ldr d0, [x1]
+; CHECK-SD-NEXT: srsra v0.4h, v1.4h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: srsra4h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: srshl v0.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> )
 %tmp4 = load <4 x i16>, ptr %B
@@ -2768,12 +3376,21 @@ define <4 x i16> @srsra4h(ptr %A, ptr %B) nounwind {
 }

 define <2 x i32> @srsra2s(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: srsra2s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: srsra.2s v0, v1, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: srsra2s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d1, [x0]
+; CHECK-SD-NEXT: ldr d0, [x1]
+; CHECK-SD-NEXT: srsra v0.2s, v1.2s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: srsra2s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: srshl v0.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: add v0.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> )
 %tmp4 = load <2 x i32>, ptr %B
@@ -2782,12 +3399,21 @@ define <2 x i32> @srsra2s(ptr %A, ptr %B) nounwind {
 }

 define <16 x i8> @srsra16b(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: srsra16b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q1, [x0]
-; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: srsra.16b v0, v1, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: srsra16b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q1, [x0]
+; CHECK-SD-NEXT: ldr q0, [x1]
+; CHECK-SD-NEXT: srsra v0.16b, v1.16b, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: srsra16b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: srshl v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: add v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp3 = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> )
 %tmp4 = load <16 x i8>, ptr %B
@@ -2796,12 +3422,21 @@ define <16 x i8> @srsra16b(ptr %A, ptr %B) nounwind {
 }

 define <8 x i16> @srsra8h(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: srsra8h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q1, [x0]
-; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: srsra.8h v0, v1, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: srsra8h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q1, [x0]
+; CHECK-SD-NEXT: ldr q0, [x1]
+; CHECK-SD-NEXT: srsra v0.8h, v1.8h, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: srsra8h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: srshl v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: add v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> )
 %tmp4 = load <8 x i16>, ptr %B
@@ -2810,12 +3445,21 @@ define <8 x i16> @srsra8h(ptr %A, ptr %B) nounwind {
 }

 define <4 x i32> @srsra4s(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: srsra4s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q1, [x0]
-; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: srsra.4s v0, v1, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: srsra4s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q1, [x0]
+; CHECK-SD-NEXT: ldr q0, [x1]
+; CHECK-SD-NEXT: srsra v0.4s, v1.4s, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: srsra4s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: srshl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> )
 %tmp4 = load <4 x i32>, ptr %B
@@ -2824,12 +3468,21 @@ define <4 x i32> @srsra4s(ptr %A, ptr %B) nounwind {
 }

 define <2 x i64> @srsra2d(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: srsra2d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q1, [x0]
-; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: srsra.2d v0, v1, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: srsra2d:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q1, [x0]
+; CHECK-SD-NEXT: ldr q0, [x1]
+; CHECK-SD-NEXT: srsra v0.2d, v1.2d, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: srsra2d:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: srshl v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp3 = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> )
 %tmp4 = load <2 x i64>, ptr %B
@@ -2871,7 +3524,7 @@ define <8 x i8> @usra8b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d1, [x0]
 ; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: usra.8b v0, v1, #1
+; CHECK-NEXT: usra v0.8b, v1.8b, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp3 = lshr <8 x i8> %tmp1,
@@ -2885,7 +3538,7 @@ define <4 x i16> @usra4h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d1, [x0]
 ; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: usra.4h v0, v1, #1
+; CHECK-NEXT: usra v0.4h, v1.4h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp3 = lshr <4 x i16> %tmp1,
@@ -2899,7 +3552,7 @@ define <2 x i32> @usra2s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d1, [x0]
 ; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: usra.2s v0, v1, #1
+; CHECK-NEXT: usra v0.2s, v1.2s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp3 = lshr <2 x i32> %tmp1,
@@ -2913,7 +3566,7 @@ define <16 x i8> @usra16b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q1, [x0]
 ; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: usra.16b v0, v1, #1
+; CHECK-NEXT: usra v0.16b, v1.16b, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp3 = lshr <16 x i8> %tmp1,
@@ -2927,7 +3580,7 @@ define <8 x i16> @usra8h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q1, [x0]
 ; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: usra.8h v0, v1, #1
+; CHECK-NEXT: usra v0.8h, v1.8h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = lshr <8 x i16> %tmp1,
@@ -2941,7 +3594,7 @@ define <4 x i32> @usra4s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q1, [x0]
 ; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: usra.4s v0, v1, #1
+; CHECK-NEXT: usra v0.4s, v1.4s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = lshr <4 x i32> %tmp1,
@@ -2955,7 +3608,7 @@ define <2 x i64> @usra2d(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q1, [x0]
 ; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: usra.2d v0, v1, #1
+; CHECK-NEXT: usra v0.2d, v1.2d, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp3 = lshr <2 x i64> %tmp1,
@@ -2965,12 +3618,20 @@ define <2 x i64> @usra2d(ptr %A, ptr %B) nounwind {
 }

 define <1 x i64> @usra1d(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: usra1d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: usra d0, d1, #1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: usra1d:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d1, [x0]
+; CHECK-SD-NEXT: ldr d0, [x1]
+; CHECK-SD-NEXT: usra d0, d1, #1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: usra1d:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr x8, [x0]
+; CHECK-GI-NEXT: ldr x9, [x1]
+; CHECK-GI-NEXT: add x8, x9, x8, lsr #1
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: ret
 %tmp1 = load <1 x i64>, ptr %A
 %tmp3 = lshr <1 x i64> %tmp1,
 %tmp4 = load <1 x i64>, ptr %B
@@ -2983,7 +3644,7 @@ define <8 x i8> @ssra8b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d1, [x0]
 ; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: ssra.8b v0, v1, #1
+; CHECK-NEXT: ssra v0.8b, v1.8b, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp3 = ashr <8 x i8> %tmp1,
@@ -2997,7 +3658,7 @@ define <4 x i16> @ssra4h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d1, [x0]
 ; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: ssra.4h v0, v1, #1
+; CHECK-NEXT: ssra v0.4h, v1.4h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp3 = ashr <4 x i16> %tmp1,
@@ -3011,7 +3672,7 @@ define <2 x i32> @ssra2s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d1, [x0]
 ; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: ssra.2s v0, v1, #1
+; CHECK-NEXT: ssra v0.2s, v1.2s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp3 = ashr <2 x i32> %tmp1,
@@ -3025,7 +3686,7 @@ define <16 x i8> @ssra16b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q1, [x0]
 ; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: ssra.16b v0, v1, #1
+; CHECK-NEXT: ssra v0.16b, v1.16b, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp3 = ashr <16 x i8> %tmp1,
@@ -3039,7 +3700,7 @@ define <8 x i16> @ssra8h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q1, [x0]
 ; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: ssra.8h v0, v1, #1
+; CHECK-NEXT: ssra v0.8h, v1.8h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp3 = ashr <8 x i16> %tmp1,
@@ -3053,7 +3714,7 @@ define <4 x i32> @ssra4s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q1, [x0]
 ; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: ssra.4s v0, v1, #1
+; CHECK-NEXT: ssra v0.4s, v1.4s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp3 = ashr <4 x i32> %tmp1,
@@ -3067,7 +3728,7 @@ define <2 x i64> @ssra2d(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q1, [x0]
 ; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: ssra.2d v0, v1, #1
+; CHECK-NEXT: ssra v0.2d, v1.2d, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp3 = ashr <2 x i64> %tmp1,
@@ -3081,8 +3742,8 @@ define <8 x i8> @shr_orr8b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: ushr.8b v0, v0, #1
-; CHECK-NEXT: orr.8b v0, v0, v1
+; CHECK-NEXT: ushr v0.8b, v0.8b, #1
+; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp4 = load <8 x i8>, ptr %B
@@ -3096,8 +3757,8 @@ define <4 x i16> @shr_orr4h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: ushr.4h v0, v0, #1
-; CHECK-NEXT: orr.8b v0, v0, v1
+; CHECK-NEXT: ushr v0.4h, v0.4h, #1
+; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp4 = load <4 x i16>, ptr %B
@@ -3111,8 +3772,8 @@ define <2 x i32> @shr_orr2s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: ushr.2s v0, v0, #1
-; CHECK-NEXT: orr.8b v0, v0, v1
+; CHECK-NEXT: ushr v0.2s, v0.2s, #1
+; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp4 = load <2 x i32>, ptr %B
@@ -3126,8 +3787,8 @@ define <16 x i8> @shr_orr16b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: ushr.16b v0, v0, #1
-; CHECK-NEXT: orr.16b v0, v0, v1
+; CHECK-NEXT: ushr v0.16b, v0.16b, #1
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp4 = load <16 x i8>, ptr %B
@@ -3141,8 +3802,8 @@ define <8 x i16> @shr_orr8h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: ushr.8h v0, v0, #1
-; CHECK-NEXT: orr.16b v0, v0, v1
+; CHECK-NEXT: ushr v0.8h, v0.8h, #1
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp4 = load <8 x i16>, ptr %B
@@ -3156,8 +3817,8 @@ define <4 x i32> @shr_orr4s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: ushr.4s v0, v0, #1
-; CHECK-NEXT: orr.16b v0, v0, v1
+; CHECK-NEXT: ushr v0.4s, v0.4s, #1
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp4 = load <4 x i32>, ptr %B
@@ -3171,8 +3832,8 @@ define <2 x i64> @shr_orr2d(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: ushr.2d v0, v0, #1
-; CHECK-NEXT: orr.16b v0, v0, v1
+; CHECK-NEXT: ushr v0.2d, v0.2d, #1
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp4 = load <2 x i64>, ptr %B
@@ -3182,13 +3843,21 @@ define <2 x i64> @shr_orr2d(ptr %A, ptr %B) nounwind {
 }

 define <8 x i8> @shl_orr8b(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: shl_orr8b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: add.8b v0, v0, v0
-; CHECK-NEXT: orr.8b v0, v0, v1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: shl_orr8b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: ldr d1, [x1]
+; CHECK-SD-NEXT: add v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shl_orr8b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: shl v0.8b, v0.8b, #1
+; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp4 = load <8 x i8>, ptr %B
 %tmp3 = shl <8 x i8> %tmp1,
@@ -3197,13 +3866,21 @@ define <8 x i8> @shl_orr8b(ptr %A, ptr %B) nounwind {
 }

 define <4 x i16> @shl_orr4h(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: shl_orr4h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: add.4h v0, v0, v0
-; CHECK-NEXT: orr.8b v0, v0, v1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: shl_orr4h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: ldr d1, [x1]
+; CHECK-SD-NEXT: add v0.4h, v0.4h, v0.4h
+; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shl_orr4h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: shl v0.4h, v0.4h, #1
+; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp4 = load <4 x i16>, ptr %B
 %tmp3 = shl <4 x i16> %tmp1,
@@ -3212,13 +3889,21 @@ define <4 x i16> @shl_orr4h(ptr %A, ptr %B) nounwind {
 }

 define <2 x i32> @shl_orr2s(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: shl_orr2s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: add.2s v0, v0, v0
-; CHECK-NEXT: orr.8b v0, v0, v1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: shl_orr2s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: ldr d1, [x1]
+; CHECK-SD-NEXT: add v0.2s, v0.2s, v0.2s
+; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shl_orr2s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: shl v0.2s, v0.2s, #1
+; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp4 = load <2 x i32>, ptr %B
 %tmp3 = shl <2 x i32> %tmp1,
@@ -3227,13 +3912,21 @@ define <2 x i32> @shl_orr2s(ptr %A, ptr %B) nounwind {
 }

 define <16 x i8> @shl_orr16b(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: shl_orr16b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: add.16b v0, v0, v0
-; CHECK-NEXT: orr.16b v0, v0, v1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: shl_orr16b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: ldr q1, [x1]
+; CHECK-SD-NEXT: add v0.16b, v0.16b, v0.16b
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shl_orr16b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: shl v0.16b, v0.16b, #1
+; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp4 = load <16 x i8>, ptr %B
 %tmp3 = shl <16 x i8> %tmp1,
@@ -3242,13 +3935,21 @@ define <16 x i8> @shl_orr16b(ptr %A, ptr %B) nounwind {
 }

 define <8 x i16> @shl_orr8h(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: shl_orr8h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: add.8h v0, v0, v0
-; CHECK-NEXT: orr.16b v0, v0, v1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: shl_orr8h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: ldr q1, [x1]
+; CHECK-SD-NEXT: add v0.8h, v0.8h, v0.8h
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shl_orr8h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: shl v0.8h, v0.8h, #1
+; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp4 = load <8 x i16>, ptr %B
 %tmp3 = shl <8 x i16> %tmp1,
@@ -3257,13 +3958,21 @@ define <8 x i16> @shl_orr8h(ptr %A, ptr %B) nounwind {
 }

 define <4 x i32> @shl_orr4s(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: shl_orr4s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: add.4s v0, v0, v0
-; CHECK-NEXT: orr.16b v0, v0, v1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: shl_orr4s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: ldr q1, [x1]
+; CHECK-SD-NEXT: add v0.4s, v0.4s, v0.4s
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shl_orr4s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: shl v0.4s, v0.4s, #1
+; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp4 = load <4 x i32>, ptr %B
 %tmp3 = shl <4 x i32> %tmp1,
@@ -3272,13 +3981,21 @@ define <4 x i32> @shl_orr4s(ptr %A, ptr %B) nounwind {
 }

 define <2 x i64> @shl_orr2d(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: shl_orr2d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: add.2d v0, v0, v0
-; CHECK-NEXT: orr.16b v0, v0, v1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: shl_orr2d:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: ldr q1, [x1]
+; CHECK-SD-NEXT: add v0.2d, v0.2d, v0.2d
+; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shl_orr2d:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: shl v0.2d, v0.2d, #1
+; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp4 = load <2 x i64>, ptr %B
 %tmp3 = shl <2 x i64> %tmp1,
@@ -3287,20 +4004,32 @@ define <2 x i64> @shl_orr2d(ptr %A, ptr %B) nounwind {
 }

 define <8 x i16> @shll(<8 x i8> %in) {
-; CHECK-LABEL: shll:
-; CHECK: // %bb.0:
-; CHECK-NEXT: shll.8h v0, v0, #8
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: shll:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: shll v0.8h, v0.8b, #8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shll:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: shl v0.8h, v0.8h, #8
+; CHECK-GI-NEXT: ret
 %ext = zext <8 x i8> %in to <8 x i16>
 %res = shl <8 x i16> %ext,
 ret <8 x i16> %res
 }

 define <4 x i32> @shll_high(<8 x i16> %in) {
-; CHECK-LABEL: shll_high:
-; CHECK: // %bb.0:
-; CHECK-NEXT: shll2.4s v0, v0, #16
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: shll_high:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: shll2 v0.4s, v0.8h, #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shll_high:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT: shl v0.4s, v0.4s, #16
+; CHECK-GI-NEXT: ret
 %extract = shufflevector <8 x i16> %in, <8 x i16> undef, <4 x i32>
 %ext = zext <4 x i16> %extract to <4 x i32>
 %res = shl <4 x i32> %ext,
@@ -3312,7 +4041,7 @@ define <8 x i8> @sli8b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: sli.8b v0, v1, #1
+; CHECK-NEXT: sli v0.8b, v1.8b, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i8>, ptr %A
 %tmp2 = load <8 x i8>, ptr %B
@@ -3325,7 +4054,7 @@ define <4 x i16> @sli4h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: sli.4h v0, v1, #1
+; CHECK-NEXT: sli v0.4h, v1.4h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i16>, ptr %A
 %tmp2 = load <4 x i16>, ptr %B
@@ -3338,7 +4067,7 @@ define <2 x i32> @sli2s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: sli.2s v0, v1, #1
+; CHECK-NEXT: sli v0.2s, v1.2s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, ptr %A
 %tmp2 = load <2 x i32>, ptr %B
@@ -3364,7 +4093,7 @@ define <16 x i8> @sli16b(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sli.16b v0, v1, #1
+; CHECK-NEXT: sli v0.16b, v1.16b, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <16 x i8>, ptr %A
 %tmp2 = load <16 x i8>, ptr %B
@@ -3377,7 +4106,7 @@ define <8 x i16> @sli8h(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sli.8h v0, v1, #1
+; CHECK-NEXT: sli v0.8h, v1.8h, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <8 x i16>, ptr %A
 %tmp2 = load <8 x i16>, ptr %B
@@ -3390,7 +4119,7 @@ define <4 x i32> @sli4s(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sli.4s v0, v1, #1
+; CHECK-NEXT: sli v0.4s, v1.4s, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, ptr %A
 %tmp2 = load <4 x i32>, ptr %B
@@ -3403,7 +4132,7 @@ define <2 x i64> @sli2d(ptr %A, ptr %B) nounwind {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: sli.2d v0, v1, #1
+; CHECK-NEXT: sli v0.2d, v1.2d, #1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, ptr %A
 %tmp2 = load <2 x i64>, ptr %B
@@ -3422,21 +4151,37 @@ declare <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32>, <4 x i32>, i32) nounw
 declare <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64>, <2 x i64>, i32) nounwind readnone

 define <1 x i64> @ashr_v1i64(<1 x i64> %a, <1 x i64> %b) {
-; CHECK-LABEL: ashr_v1i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: neg d1, d1
-; CHECK-NEXT: sshl d0, d0, d1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ashr_v1i64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: neg d1, d1
+; CHECK-SD-NEXT: sshl d0, d0, d1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ashr_v1i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: fmov x8, d0
+; CHECK-GI-NEXT: fmov x9, d1
+; CHECK-GI-NEXT: asr x8, x8, x9
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: ret
 %c = ashr <1 x i64> %a, %b
 ret <1 x i64> %c
 }

 define void @sqshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) {
-; CHECK-LABEL: sqshl_zero_shift_amount:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: addp.2d v0, v0, v1
-; CHECK-NEXT: str q0, [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sqshl_zero_shift_amount:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: addp v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT: str q0, [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sqshl_zero_shift_amount:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
+; CHECK-GI-NEXT: addp v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: sqshl v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: str q0, [x0]
+; CHECK-GI-NEXT: ret
 entry:
 %vpaddq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %a, <2 x i64> %b)
 %vshlq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> %vpaddq_v2.i.i, <2 x i64> zeroinitializer)
@@ -3445,11 +4190,19 @@ entry:
 define void @uqshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) {
-; CHECK-LABEL: uqshl_zero_shift_amount:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: addp.2d v0, v0, v1
-; CHECK-NEXT: str q0, [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: uqshl_zero_shift_amount:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: addp v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT: str q0, [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uqshl_zero_shift_amount:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
+; CHECK-GI-NEXT: addp v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: uqshl v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: str q0, [x0]
+; CHECK-GI-NEXT: ret
 entry:
 %vpaddq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %a, <2 x i64> %b)
 %vshlq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> %vpaddq_v2.i.i, <2 x i64> zeroinitializer)
@@ -3458,11 +4211,19 @@ entry:
 define void @srshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) {
-; CHECK-LABEL: srshl_zero_shift_amount:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: addp.2d v0, v0, v1
-; CHECK-NEXT: str q0, [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: srshl_zero_shift_amount:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: addp v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT: str q0, [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: srshl_zero_shift_amount:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
+; CHECK-GI-NEXT: addp v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: srshl v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: str q0, [x0]
+; CHECK-GI-NEXT: ret
 entry:
 %vpaddq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %a, <2 x i64> %b)
 %vshlq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %vpaddq_v2.i.i, <2 x i64> zeroinitializer)
@@ -3471,11 +4232,19 @@ entry:
 define void @urshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) {
-; CHECK-LABEL: urshl_zero_shift_amount:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: addp.2d v0, v0, v1
-; CHECK-NEXT: str q0, [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: urshl_zero_shift_amount:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: addp v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT: str q0, [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: urshl_zero_shift_amount:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
+; CHECK-GI-NEXT: addp v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: urshl v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: str q0, [x0]
+; CHECK-GI-NEXT: ret
 entry:
 %vpaddq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %a, <2 x i64> %b)
 %vshlq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %vpaddq_v2.i.i, <2 x i64> zeroinitializer)
@@ -3486,8 +4255,8 @@ entry:
 define void @sqshlu_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) {
 ; CHECK-LABEL: sqshlu_zero_shift_amount:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: addp.2d v0, v0, v1
-; CHECK-NEXT: sqshlu.2d v0, v0, #0
+; CHECK-NEXT: addp v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: sqshlu v0.2d, v0.2d, #0
 ; CHECK-NEXT: str q0, [x0]
 ; CHECK-NEXT: ret
 entry:
@@ -3498,11 +4267,19 @@ entry:
 }

 define void @sshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) {
-; CHECK-LABEL: sshl_zero_shift_amount:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: addp.2d v0, v0, v1
-; CHECK-NEXT: str q0, [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sshl_zero_shift_amount:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: addp v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT: str q0, [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sshl_zero_shift_amount:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
+; CHECK-GI-NEXT: addp v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: sshl v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: str q0, [x0]
+; CHECK-GI-NEXT: ret
 entry:
 %vpaddq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %a, <2 x i64> %b)
 %vshlq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> %vpaddq_v2.i.i, <2 x i64> zeroinitializer)
@@ -3511,11 +4288,19 @@ entry:
 define void @ushl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) {
-; CHECK-LABEL: ushl_zero_shift_amount:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: addp.2d v0, v0, v1
-; CHECK-NEXT: str q0, [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ushl_zero_shift_amount:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: addp v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT: str q0, [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ushl_zero_shift_amount:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
+; CHECK-GI-NEXT: addp v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: ushl v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: str q0, [x0]
+; CHECK-GI-NEXT: ret
 entry:
 %vpaddq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %a, <2 x i64> %b)
 %vshlq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> %vpaddq_v2.i.i, <2 x i64> zeroinitializer)
@@ -3526,8 +4311,8 @@ entry:
 define <4 x i32> @sext_rshrn(<4 x i32> noundef %a) {
 ; CHECK-LABEL: sext_rshrn:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: rshrn.4h v0, v0, #13
-; CHECK-NEXT: sshll.4s v0, v0, #0
+; CHECK-NEXT: rshrn v0.4h, v0.4s, #13
+; CHECK-NEXT: sshll v0.4s, v0.4h, #0
 ; CHECK-NEXT: ret
 entry:
 %vrshrn_n1 = tail call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %a, i32 13)
@@ -3538,8 +4323,8 @@ entry:
 define <4 x i32> @zext_rshrn(<4 x i32> noundef %a) {
 ; CHECK-LABEL: zext_rshrn:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: rshrn.4h v0, v0, #13
-; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: rshrn v0.4h, v0.4s, #13
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
 ; CHECK-NEXT: ret
 entry:
 %vrshrn_n1 = tail call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %a, i32 13)
@@ -3550,9 +4335,9 @@ entry:
 define <4 x i16> @mul_rshrn(<4 x i32> noundef %a) {
 ; CHECK-LABEL: mul_rshrn:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: movi.4s v1, #3
-; CHECK-NEXT: add.4s v0, v0, v1
-; CHECK-NEXT: rshrn.4h v0, v0, #13
+; CHECK-NEXT: movi v1.4s, #3
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: rshrn v0.4h, v0.4s, #13
 ; CHECK-NEXT: ret
 entry:
 %b = add <4 x i32> %a,
@@ -3561,15 +4346,61 @@ entry:
 }

 define <8 x i16> @signbits_vashr(<8 x i16> %a) {
-; CHECK-LABEL: signbits_vashr:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sshr.8h v0, v0, #8
-; CHECK-NEXT: sshr.8h v0, v0, #9
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: signbits_vashr:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sshr v0.8h, v0.8h, #8
+; CHECK-SD-NEXT: sshr v0.8h, v0.8h, #9
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: signbits_vashr:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mvni v1.8h, #7
+; CHECK-GI-NEXT: mvni v2.8h, #8
+; CHECK-GI-NEXT: sshl v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: sshl v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT: sshr v0.8h, v0.8h, #7
+; CHECK-GI-NEXT: ret
 %b = call <8 x i16> @llvm.aarch64.neon.sshl.v8i16(<8 x i16> %a, <8 x i16> )
 %c = call <8 x i16> @llvm.aarch64.neon.sshl.v8i16(<8 x i16> %b, <8 x i16> )
 %d = ashr <8 x i16> %c,
 ret <8 x i16> %d
 }

+define <2 x i8> @lshr_trunc_v2i64_v2i8(<2 x i64> %a) {
+; CHECK-LABEL: lshr_trunc_v2i64_v2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shrn v0.2s, v0.2d, #16
+; CHECK-NEXT: ret
+ %b = lshr <2 x i64> %a,
+ %c = trunc <2 x i64> %b to <2 x i8>
+ ret <2 x i8> %c
+}
+
+define <2 x i8> @ashr_trunc_v2i64_v2i8(<2 x i64> %a) {
+; CHECK-LABEL: ashr_trunc_v2i64_v2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shrn v0.2s, v0.2d, #16
+; CHECK-NEXT: ret
+ %b = ashr <2 x i64> %a,
+ %c = trunc <2 x i64> %b to <2 x i8>
+ ret <2 x i8> %c
+}
+
+define <2 x i8> @shl_trunc_v2i64_v2i8(<2 x i64> %a) {
+; CHECK-SD-LABEL: shl_trunc_v2i64_v2i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: xtn v0.2s, v0.2d
+; CHECK-SD-NEXT: shl v0.2s, v0.2s, #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shl_trunc_v2i64_v2i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: shl v0.2d, v0.2d, #16
+; CHECK-GI-NEXT: xtn v0.2s, v0.2d
+; CHECK-GI-NEXT: ret
+ %b = shl <2 x i64> %a,
+ %c = trunc <2 x i64> %b to <2 x i8>
+ ret <2 x i8> %c
+}
+
 declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>)
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
index 5a5dee0b53d43..4cb1d5b2fb345 100644
--- a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
@@ -5,7 +5,7 @@
 declare void @llvm.masked.scatter.nxv16i8.nxv16p0(, , i32 immarg, )

-define fastcc i8 @allocno_reload_assign() {
+define fastcc i8 @allocno_reload_assign(ptr %p) {
 ; CHECK-LABEL: allocno_reload_assign:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: fmov d0, xzr
@@ -14,8 +14,8 @@ define fastcc i8 @allocno_reload_assign() {
 ; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, #0
 ; CHECK-NEXT: uzp1 p0.s, p0.s, p0.s
 ; CHECK-NEXT: uzp1 p0.h, p0.h, p0.h
-; CHECK-NEXT: uzp1 p0.b, p0.b, p0.b
-; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1
+; CHECK-NEXT: uzp1 p8.b, p0.b, p0.b
+; CHECK-NEXT: mov z0.b, p8/z, #1 // =0x1
 ; CHECK-NEXT: fmov w8, s0
 ; CHECK-NEXT: mov z0.b, #0 // =0x0
 ; CHECK-NEXT: uunpklo z1.h, z0.b
@@ -30,34 +30,35 @@ define fastcc i8 @allocno_reload_assign() {
 ; CHECK-NEXT: punpklo p1.h, p0.b
 ; CHECK-NEXT: punpkhi p0.h, p0.b
 ; CHECK-NEXT: punpklo p2.h, p1.b
-; CHECK-NEXT: punpkhi p3.h, p1.b
+; CHECK-NEXT: punpkhi p4.h, p1.b
 ; CHECK-NEXT: uunpklo z0.d, z2.s
 ; CHECK-NEXT: uunpkhi z1.d, z2.s
-; CHECK-NEXT: punpklo p5.h, p0.b
+; CHECK-NEXT: punpklo p6.h, p0.b
 ; CHECK-NEXT: uunpklo z2.d, z3.s
 ; CHECK-NEXT: uunpkhi z3.d, z3.s
-; CHECK-NEXT: punpkhi p7.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
 ; CHECK-NEXT: uunpklo z4.d, z5.s
 ; CHECK-NEXT: uunpkhi z5.d, z5.s
 ; CHECK-NEXT: uunpklo z6.d, z7.s
 ; CHECK-NEXT: uunpkhi z7.d, z7.s
-; CHECK-NEXT: punpklo p0.h, p2.b
-; CHECK-NEXT: punpkhi p1.h, p2.b
-; CHECK-NEXT: punpklo p2.h, p3.b
-; CHECK-NEXT: punpkhi p3.h, p3.b
-; CHECK-NEXT: punpklo p4.h, p5.b
-; CHECK-NEXT: punpkhi p5.h, p5.b
-; CHECK-NEXT: punpklo p6.h, p7.b
-; CHECK-NEXT: punpkhi p7.h, p7.b
+; CHECK-NEXT: punpklo p1.h, p2.b
+; CHECK-NEXT: punpkhi p2.h, p2.b
+; CHECK-NEXT: punpklo p3.h, p4.b
+; CHECK-NEXT: punpkhi p4.h, p4.b
+; CHECK-NEXT: punpklo p5.h, p6.b
+; CHECK-NEXT: punpkhi p6.h, p6.b
+; CHECK-NEXT: punpklo p7.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
 ; CHECK-NEXT: .LBB0_1: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: st1b { z0.d }, p0, [z16.d]
-; CHECK-NEXT: st1b { z1.d }, p1, [z16.d]
-; CHECK-NEXT: st1b { z2.d }, p2, [z16.d]
-; CHECK-NEXT: st1b { z3.d }, p3, [z16.d]
-; CHECK-NEXT: st1b { z4.d }, p4, [z16.d]
-; CHECK-NEXT: st1b { z5.d }, p5, [z16.d]
-; CHECK-NEXT: st1b { z6.d }, p6, [z16.d]
-; CHECK-NEXT: st1b { z7.d }, p7, [z16.d]
+; CHECK-NEXT: st1b { z0.d }, p1, [z16.d]
+; CHECK-NEXT: st1b { z1.d }, p2, [z16.d]
+; CHECK-NEXT: st1b { z2.d }, p3, [z16.d]
+; CHECK-NEXT: st1b { z3.d }, p4, [z16.d]
+; CHECK-NEXT: st1b { z4.d }, p5, [z16.d]
+; CHECK-NEXT: st1b { z5.d }, p6, [z16.d]
+; CHECK-NEXT: st1b { z6.d }, p7, [z16.d]
+; CHECK-NEXT: st1b { z7.d }, p0, [z16.d]
+; CHECK-NEXT: str p8, [x0]
 ; CHECK-NEXT: b .LBB0_1
 br label %1

@@ -66,6 +67,7 @@ define fastcc i8 @allocno_reload_assign() {
 %constexpr1 = shufflevector %constexpr, poison, zeroinitializer
 %constexpr2 = xor %constexpr1, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)
 call void @llvm.masked.scatter.nxv16i8.nxv16p0( zeroinitializer, zeroinitializer, i32 0, %constexpr2)
+ store %constexpr, ptr %p, align 16
 br label %1
 }
diff --git a/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll b/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll
new file mode 100644
index 0000000000000..2388a0f206a51
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll
@@ -0,0 +1,225 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+sve < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+
+define i1 @extract_icmp_v4i32_const_splat_rhs(<4 x i32> %a) {
+; CHECK-LABEL: extract_icmp_v4i32_const_splat_rhs:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: cmp w8, #5
+; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: ret
+ %icmp = icmp ult <4 x i32> %a, splat (i32 5)
+ %ext = extractelement <4 x i1> %icmp, i32 1
+ ret i1 %ext
+}
+
+define i1 @extract_icmp_v4i32_const_splat_lhs(<4 x i32> %a) {
+; CHECK-LABEL: extract_icmp_v4i32_const_splat_lhs:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: cmp w8, #7
+; CHECK-NEXT: cset w0, hi
+; CHECK-NEXT: ret
+ %icmp = icmp ult <4 x i32> splat(i32 7), %a
+ %ext = extractelement <4 x i1> %icmp, i32 1
+ ret i1 %ext
+}
+
+define i1 @extract_icmp_v4i32_const_vec_rhs(<4 x i32> %a) {
+; CHECK-LABEL: extract_icmp_v4i32_const_vec_rhs:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: cmp w8, #234
+; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: ret
+ %icmp = icmp ult <4 x i32> %a,
+ %ext = extractelement <4 x i1> %icmp, i32 1
+ ret i1 %ext
+}
+
+define i1 @extract_fcmp_v4f32_const_splat_rhs(<4 x float> %a) {
+; CHECK-LABEL: extract_fcmp_v4f32_const_splat_rhs:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov s0, v0.s[1]
+; CHECK-NEXT: fmov s1, #4.00000000
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset w0, lt
+; CHECK-NEXT: ret
+ %fcmp = fcmp ult <4 x float> %a, splat(float 4.0e+0)
+ %ext = extractelement <4 x i1> %fcmp, i32 1
+ ret i1 %ext
+}
+
+define void @vector_loop_with_icmp(ptr nocapture noundef writeonly %dest) {
+; CHECK-LABEL: vector_loop_with_icmp:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: index z0.d, #0, #1
+; CHECK-NEXT: mov w8, #4 // =0x4
+; CHECK-NEXT: mov w9, #16 // =0x10
+; CHECK-NEXT: dup v2.2d, x8
+; CHECK-NEXT: add x8, x0, #8
+; CHECK-NEXT: mov w10, #1 // =0x1
+; CHECK-NEXT: mov z1.d, z0.d
+; CHECK-NEXT: add z1.d, z1.d, #2 // =0x2
+; CHECK-NEXT: b .LBB4_2
+; CHECK-NEXT: .LBB4_1: // %pred.store.continue18
+; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT: add v1.2d, v1.2d, v2.2d
+; CHECK-NEXT: add v0.2d, v0.2d, v2.2d
+; CHECK-NEXT: subs x9, x9, #4
+; CHECK-NEXT: add x8, x8, #16
+; CHECK-NEXT: b.eq .LBB4_10
+; CHECK-NEXT: .LBB4_2: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: fmov x11, d0
+; CHECK-NEXT: cmp x11, #14
+; CHECK-NEXT: b.hi .LBB4_4
+; CHECK-NEXT: // %bb.3: // %pred.store.if
+; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT: stur w10, [x8, #-8]
+; CHECK-NEXT: .LBB4_4: // %pred.store.continue
+; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT: mov x11, v0.d[1]
+; CHECK-NEXT: cmp x11, #14
+; CHECK-NEXT:
b.hi .LBB4_6 +; CHECK-NEXT: // %bb.5: // %pred.store.if5 +; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1 +; CHECK-NEXT: stur w10, [x8, #-4] +; CHECK-NEXT: .LBB4_6: // %pred.store.continue6 +; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1 +; CHECK-NEXT: fmov x11, d1 +; CHECK-NEXT: cmp x11, #14 +; CHECK-NEXT: b.hi .LBB4_8 +; CHECK-NEXT: // %bb.7: // %pred.store.if7 +; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1 +; CHECK-NEXT: str w10, [x8] +; CHECK-NEXT: .LBB4_8: // %pred.store.continue8 +; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1 +; CHECK-NEXT: mov x11, v1.d[1] +; CHECK-NEXT: cmp x11, #14 +; CHECK-NEXT: b.hi .LBB4_1 +; CHECK-NEXT: // %bb.9: // %pred.store.if9 +; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1 +; CHECK-NEXT: str w10, [x8, #4] +; CHECK-NEXT: b .LBB4_1 +; CHECK-NEXT: .LBB4_10: // %for.cond.cleanup +; CHECK-NEXT: ret +entry: + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %entry ], [ %index.next, %pred.store.continue18 ] + %vec.ind = phi <4 x i64> [ , %entry ], [ %vec.ind.next, %pred.store.continue18 ] + %0 = icmp ult <4 x i64> %vec.ind, + %1 = extractelement <4 x i1> %0, i64 0 + br i1 %1, label %pred.store.if, label %pred.store.continue + +pred.store.if: + %2 = getelementptr inbounds i32, ptr %dest, i64 %index + store i32 1, ptr %2, align 4 + br label %pred.store.continue + +pred.store.continue: + %3 = extractelement <4 x i1> %0, i64 1 + br i1 %3, label %pred.store.if5, label %pred.store.continue6 + +pred.store.if5: + %4 = or disjoint i64 %index, 1 + %5 = getelementptr inbounds i32, ptr %dest, i64 %4 + store i32 1, ptr %5, align 4 + br label %pred.store.continue6 + +pred.store.continue6: + %6 = extractelement <4 x i1> %0, i64 2 + br i1 %6, label %pred.store.if7, label %pred.store.continue8 + +pred.store.if7: + %7 = or disjoint i64 %index, 2 + %8 = getelementptr inbounds i32, ptr %dest, i64 %7 + store i32 1, ptr %8, align 4 + br label %pred.store.continue8 + +pred.store.continue8: + %9 = extractelement <4 x i1> %0, i64 3 + br i1 %9, label %pred.store.if9, label %pred.store.continue18 + +pred.store.if9: + %10 = or disjoint i64 %index, 3 + %11 = getelementptr inbounds i32, ptr %dest, i64 %10 + store i32 1, ptr %11, align 4 + br label %pred.store.continue18 + +pred.store.continue18: + %index.next = add i64 %index, 4 + %vec.ind.next = add <4 x i64> %vec.ind, + %24 = icmp eq i64 %index.next, 16 + br i1 %24, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: + ret void +} + + +; Negative tests + +define i1 @extract_icmp_v4i32_splat_rhs(<4 x i32> %a, i32 %b) { +; CHECK-LABEL: extract_icmp_v4i32_splat_rhs: +; CHECK: // %bb.0: +; CHECK-NEXT: dup v1.4s, w0 +; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: umov w8, v0.h[1] +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %ins = insertelement <4 x i32> poison, i32 %b, i32 0 + %splat = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> zeroinitializer + %icmp = icmp ult <4 x i32> %a, %splat + %ext = extractelement <4 x i1> %icmp, i32 1 + ret i1 %ext +} + +define i1 @extract_icmp_v4i32_splat_rhs_mul_use(<4 x i32> %a, ptr %p) { +; CHECK-LABEL: extract_icmp_v4i32_splat_rhs_mul_use: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.4s, #235 +; CHECK-NEXT: adrp x9, .LCPI6_0 +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI6_0] +; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s +; CHECK-NEXT: xtn v1.4h, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: umov w9, v1.h[1] +; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: and w0, w9, #0x1 
+; CHECK-NEXT: strb w10, [x8]
+; CHECK-NEXT: ret
+  %icmp = icmp ult <4 x i32> %a, splat(i32 235)
+  %ext = extractelement <4 x i1> %icmp, i32 1
+  store <4 x i1> %icmp, ptr %p, align 4
+  ret i1 %ext
+}
+
+define i1 @extract_icmp_v4i32_splat_rhs_unknown_idx(<4 x i32> %a, i32 %c) {
+; CHECK-LABEL: extract_icmp_v4i32_splat_rhs_unknown_idx:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: movi v1.4s, #127
+; CHECK-NEXT: add x8, sp, #8
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: bfi x8, x0, #1, #2
+; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: str d0, [sp, #8]
+; CHECK-NEXT: ldrh w8, [x8]
+; CHECK-NEXT: and w0, w8, #0x1
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+  %icmp = icmp ult <4 x i32> %a, splat(i32 127)
+  %ext = extractelement <4 x i1> %icmp, i32 %c
+  ret i1 %ext
+}
diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
index 6a4ab837fc472..531562d3aa678 100644
--- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
@@ -4,7 +4,6 @@
 
 ; CHECK-GI: warning: Instruction selection used fallback path for v16i4
 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i1
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i128
 
 declare <1 x i8> @llvm.sadd.sat.v1i8(<1 x i8>, <1 x i8>)
 declare <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8>, <2 x i8>)
@@ -498,21 +497,45 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 }
 
 define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
-; CHECK-LABEL: v2i128:
-; CHECK: // %bb.0:
-; CHECK-NEXT: adds x8, x0, x4
-; CHECK-NEXT: adcs x9, x1, x5
-; CHECK-NEXT: asr x10, x9, #63
-; CHECK-NEXT: eor x11, x10, #0x8000000000000000
-; CHECK-NEXT: csel x0, x10, x8, vs
-; CHECK-NEXT: csel x1, x11, x9, vs
-; CHECK-NEXT: adds x8, x2, x6
-; CHECK-NEXT: adcs x9, x3, x7
-; CHECK-NEXT: asr x10, x9, #63
-; CHECK-NEXT: eor x11, x10, #0x8000000000000000
-; CHECK-NEXT: csel x2, x10, x8, vs
-; CHECK-NEXT: csel x3, x11, x9, vs
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v2i128:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: adds x8, x0, x4
+; CHECK-SD-NEXT: adcs x9, x1, x5
+; CHECK-SD-NEXT: asr x10, x9, #63
+; CHECK-SD-NEXT: eor x11, x10, #0x8000000000000000
+; CHECK-SD-NEXT: csel x0, x10, x8, vs
+; CHECK-SD-NEXT: csel x1, x11, x9, vs
+; CHECK-SD-NEXT: adds x8, x2, x6
+; CHECK-SD-NEXT: adcs x9, x3, x7
+; CHECK-SD-NEXT: asr x10, x9, #63
+; CHECK-SD-NEXT: eor x11, x10, #0x8000000000000000
+; CHECK-SD-NEXT: csel x2, x10, x8, vs
+; CHECK-SD-NEXT: csel x3, x11, x9, vs
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v2i128:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adds x9, x0, x4
+; CHECK-GI-NEXT: mov w8, wzr
+; CHECK-GI-NEXT: mov x13, #-9223372036854775808 // =0x8000000000000000
+; CHECK-GI-NEXT: adcs x10, x1, x5
+; CHECK-GI-NEXT: asr x11, x10, #63
+; CHECK-GI-NEXT: cset w12, vs
+; CHECK-GI-NEXT: cmp w8, #1
+; CHECK-GI-NEXT: adc x14, x11, x13
+; CHECK-GI-NEXT: tst w12, #0x1
+; CHECK-GI-NEXT: csel x0, x11, x9, ne
+; CHECK-GI-NEXT: csel x1, x14, x10, ne
+; CHECK-GI-NEXT: adds x9, x2, x6
+; CHECK-GI-NEXT: adcs x10, x3, x7
+; CHECK-GI-NEXT: asr x11, x10, #63
+; CHECK-GI-NEXT: cset w12, vs
+; CHECK-GI-NEXT: cmp w8, #1
+; CHECK-GI-NEXT: adc x8, x11, x13
+; CHECK-GI-NEXT: tst w12, #0x1
+; CHECK-GI-NEXT: csel x2, x11, x9, ne
+; CHECK-GI-NEXT: csel x3, x8, x10, ne
+; CHECK-GI-NEXT: ret
   %z = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %x, <2 x i128> %y)
   ret <2 x i128> %z
 }
diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
index 86a503038766c..be4a5843e8215 100644
--- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
@@ -4,7 +4,6 @@
 
 ; CHECK-GI: warning: Instruction selection used fallback path for v16i4
 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i1
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i128
 
 declare <1 x i8> @llvm.ssub.sat.v1i8(<1 x i8>, <1 x i8>)
 declare <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8>, <2 x i8>)
@@ -501,21 +500,45 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 }
 
 define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
-; CHECK-LABEL: v2i128:
-; CHECK: // %bb.0:
-; CHECK-NEXT: subs x8, x0, x4
-; CHECK-NEXT: sbcs x9, x1, x5
-; CHECK-NEXT: asr x10, x9, #63
-; CHECK-NEXT: eor x11, x10, #0x8000000000000000
-; CHECK-NEXT: csel x0, x10, x8, vs
-; CHECK-NEXT: csel x1, x11, x9, vs
-; CHECK-NEXT: subs x8, x2, x6
-; CHECK-NEXT: sbcs x9, x3, x7
-; CHECK-NEXT: asr x10, x9, #63
-; CHECK-NEXT: eor x11, x10, #0x8000000000000000
-; CHECK-NEXT: csel x2, x10, x8, vs
-; CHECK-NEXT: csel x3, x11, x9, vs
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v2i128:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: subs x8, x0, x4
+; CHECK-SD-NEXT: sbcs x9, x1, x5
+; CHECK-SD-NEXT: asr x10, x9, #63
+; CHECK-SD-NEXT: eor x11, x10, #0x8000000000000000
+; CHECK-SD-NEXT: csel x0, x10, x8, vs
+; CHECK-SD-NEXT: csel x1, x11, x9, vs
+; CHECK-SD-NEXT: subs x8, x2, x6
+; CHECK-SD-NEXT: sbcs x9, x3, x7
+; CHECK-SD-NEXT: asr x10, x9, #63
+; CHECK-SD-NEXT: eor x11, x10, #0x8000000000000000
+; CHECK-SD-NEXT: csel x2, x10, x8, vs
+; CHECK-SD-NEXT: csel x3, x11, x9, vs
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v2i128:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: subs x9, x0, x4
+; CHECK-GI-NEXT: mov w8, wzr
+; CHECK-GI-NEXT: mov x13, #-9223372036854775808 // =0x8000000000000000
+; CHECK-GI-NEXT: sbcs x10, x1, x5
+; CHECK-GI-NEXT: asr x11, x10, #63
+; CHECK-GI-NEXT: cset w12, vs
+; CHECK-GI-NEXT: cmp w8, #1
+; CHECK-GI-NEXT: adc x14, x11, x13
+; CHECK-GI-NEXT: tst w12, #0x1
+; CHECK-GI-NEXT: csel x0, x11, x9, ne
+; CHECK-GI-NEXT: csel x1, x14, x10, ne
+; CHECK-GI-NEXT: subs x9, x2, x6
+; CHECK-GI-NEXT: sbcs x10, x3, x7
+; CHECK-GI-NEXT: asr x11, x10, #63
+; CHECK-GI-NEXT: cset w12, vs
+; CHECK-GI-NEXT: cmp w8, #1
+; CHECK-GI-NEXT: adc x8, x11, x13
+; CHECK-GI-NEXT: tst w12, #0x1
+; CHECK-GI-NEXT: csel x2, x11, x9, ne
+; CHECK-GI-NEXT: csel x3, x8, x10, ne
+; CHECK-GI-NEXT: ret
   %z = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %x, <2 x i128> %y)
   ret <2 x i128> %z
 }
diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
index d4587c3439967..924bd3981779e 100644
--- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
@@ -4,7 +4,6 @@
 
 ; CHECK-GI: warning: Instruction selection used fallback path for v16i4
 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i1
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i128
 
 declare <1 x i8> @llvm.uadd.sat.v1i8(<1 x i8>, <1 x i8>)
 declare <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8>, <2 x i8>)
@@ -492,17 +491,33 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 }
 
 define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
-; CHECK-LABEL: v2i128:
-; CHECK: // %bb.0:
-; CHECK-NEXT: adds x8, x0, x4
-; CHECK-NEXT: adcs x9, x1, x5
-; CHECK-NEXT: csinv x0, x8, xzr, lo
-; CHECK-NEXT: csinv x1, x9, xzr, lo
-; CHECK-NEXT: adds x8, x2, x6
-; CHECK-NEXT: adcs x9, x3, x7
-; CHECK-NEXT: csinv x2, x8, xzr, lo
-; CHECK-NEXT: csinv x3, x9, xzr, lo
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v2i128:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: adds x8, x0, x4
+; CHECK-SD-NEXT: adcs x9, x1, x5
+; CHECK-SD-NEXT: csinv x0, x8, xzr, lo
+; CHECK-SD-NEXT: csinv x1, x9, xzr, lo
+; CHECK-SD-NEXT: adds x8, x2, x6
+; CHECK-SD-NEXT: adcs x9, x3, x7
+; CHECK-SD-NEXT: csinv x2, x8, xzr, lo
+; CHECK-SD-NEXT: csinv x3, x9, xzr, lo
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v2i128:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adds x8, x0, x4
+; CHECK-GI-NEXT: adcs x9, x1, x5
+; CHECK-GI-NEXT: cset w10, hs
+; CHECK-GI-NEXT: tst w10, #0x1
+; CHECK-GI-NEXT: csinv x0, x8, xzr, eq
+; CHECK-GI-NEXT: csinv x1, x9, xzr, eq
+; CHECK-GI-NEXT: adds x8, x2, x6
+; CHECK-GI-NEXT: adcs x9, x3, x7
+; CHECK-GI-NEXT: cset w10, hs
+; CHECK-GI-NEXT: tst w10, #0x1
+; CHECK-GI-NEXT: csinv x2, x8, xzr, eq
+; CHECK-GI-NEXT: csinv x3, x9, xzr, eq
+; CHECK-GI-NEXT: ret
   %z = call <2 x i128> @llvm.uadd.sat.v2i128(<2 x i128> %x, <2 x i128> %y)
   ret <2 x i128> %z
 }
diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
index 123f4280bd8ff..a623eb554cac7 100644
--- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
@@ -4,7 +4,6 @@
 
 ; CHECK-GI: warning: Instruction selection used fallback path for v16i4
 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i1
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i128
 
 declare <1 x i8> @llvm.usub.sat.v1i8(<1 x i8>, <1 x i8>)
 declare <2 x i8> @llvm.usub.sat.v2i8(<2 x i8>, <2 x i8>)
@@ -490,17 +489,33 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 }
 
 define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
-; CHECK-LABEL: v2i128:
-; CHECK: // %bb.0:
-; CHECK-NEXT: subs x8, x0, x4
-; CHECK-NEXT: sbcs x9, x1, x5
-; CHECK-NEXT: csel x0, xzr, x8, lo
-; CHECK-NEXT: csel x1, xzr, x9, lo
-; CHECK-NEXT: subs x8, x2, x6
-; CHECK-NEXT: sbcs x9, x3, x7
-; CHECK-NEXT: csel x2, xzr, x8, lo
-; CHECK-NEXT: csel x3, xzr, x9, lo
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v2i128:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: subs x8, x0, x4
+; CHECK-SD-NEXT: sbcs x9, x1, x5
+; CHECK-SD-NEXT: csel x0, xzr, x8, lo
+; CHECK-SD-NEXT: csel x1, xzr, x9, lo
+; CHECK-SD-NEXT: subs x8, x2, x6
+; CHECK-SD-NEXT: sbcs x9, x3, x7
+; CHECK-SD-NEXT: csel x2, xzr, x8, lo
+; CHECK-SD-NEXT: csel x3, xzr, x9, lo
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v2i128:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: subs x8, x0, x4
+; CHECK-GI-NEXT: sbcs x9, x1, x5
+; CHECK-GI-NEXT: cset w10, lo
+; CHECK-GI-NEXT: tst w10, #0x1
+; CHECK-GI-NEXT: csel x0, xzr, x8, ne
+; CHECK-GI-NEXT: csel x1, xzr, x9, ne
+; CHECK-GI-NEXT: subs x8, x2, x6
+; CHECK-GI-NEXT: sbcs x9, x3, x7
+; CHECK-GI-NEXT: cset w10, lo
+; CHECK-GI-NEXT: tst w10, #0x1
+; CHECK-GI-NEXT: csel x2, xzr, x8, ne
+; CHECK-GI-NEXT: csel x3, xzr, x9, ne
+; CHECK-GI-NEXT: ret
   %z = call <2 x i128> @llvm.usub.sat.v2i128(<2 x i128> %x, <2 x i128> %y)
   ret <2 x i128> %z
 }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
index 824d3708c027d..33dd2bd540ad0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -4,29 +4,15 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,W32 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
 
-; RUN: opt -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
-
 ; GCN-LABEL: {{^}}fold_wavefrontsize:
-; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
 
 ; W32: v_mov_b32_e32 [[V:v[0-9]+]], 32
 ; W64: v_mov_b32_e32 [[V:v[0-9]+]], 64
 ; GCN: store_{{dword|b32}} v{{.+}}, [[V]]
 
-; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
-; OPT: store i32 %tmp, ptr addrspace(1) %arg, align 4
-; OPT-NEXT: ret void
 define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+
 bb:
   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
   store i32 %tmp, ptr addrspace(1) %arg, align 4
@@ -34,18 +20,12 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}fold_and_optimize_wavefrontsize:
-; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
 
 ; W32: v_mov_b32_e32 [[V:v[0-9]+]], 1{{$}}
 ; W64: v_mov_b32_e32 [[V:v[0-9]+]], 2{{$}}
 ; GCN-NOT: cndmask
 ; GCN: store_{{dword|b32}} v{{.+}}, [[V]]
 
-; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
-; OPT: %tmp1 = icmp ugt i32 %tmp, 32
-; OPT: %tmp2 = select i1 %tmp1, i32 2, i32 1
-; OPT: store i32 %tmp2, ptr addrspace(1) %arg
-; OPT-NEXT: ret void
 define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) {
 bb:
@@ -57,13 +37,6 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}fold_and_optimize_if_wavefrontsize:
-; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
-
-; OPT: bb:
-; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
-; OPT: %tmp1 = icmp ugt i32 %tmp, 32
-; OPT: bb3:
-; OPT-NEXT: ret void
 define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) {
 bb:
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/double-arith.ll b/llvm/test/CodeGen/RISCV/GlobalISel/double-arith.ll
index eafc9c644bdbf..2f7c93eb1c0de 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/double-arith.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/double-arith.ll
@@ -186,8 +186,7 @@ define double @fsgnjn_d(double %a, double %b) nounwind {
 ;
 ; CHECKIFD-LABEL: fsgnjn_d:
 ; CHECKIFD: # %bb.0:
-; CHECKIFD-NEXT: fneg.d fa5, fa1
-; CHECKIFD-NEXT: fsgnj.d fa0, fa0, fa5
+; CHECKIFD-NEXT: fsgnjn.d fa0, fa0, fa1
 ; CHECKIFD-NEXT: ret
 ;
 ; RV32I-LABEL: fsgnjn_d:
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/float-arith.ll b/llvm/test/CodeGen/RISCV/GlobalISel/float-arith.ll
index 0d210890c41f9..7fe4d2ef797af 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/float-arith.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/float-arith.ll
@@ -183,8 +183,7 @@ define float @fsgnjn_s(float %a, float %b) nounwind {
 ; CHECKIF-LABEL: fsgnjn_s:
 ; CHECKIF: # %bb.0:
 ; CHECKIF-NEXT: fadd.s fa5, fa0, fa1
-; CHECKIF-NEXT: fneg.s fa5, fa5
-; CHECKIF-NEXT: fsgnj.s fa0, fa0, fa5
+; CHECKIF-NEXT: fsgnjn.s fa0, fa0, fa5
 ; CHECKIF-NEXT: ret
 ;
 ; RV32I-LABEL: fsgnjn_s:
diff --git a/llvm/test/MachineVerifier/RISCV/subreg-liveness.mir b/llvm/test/MachineVerifier/RISCV/subreg-liveness.mir
new file mode 100644
index 0000000000000..c69bc1b5eca64
--- /dev/null
+++ b/llvm/test/MachineVerifier/RISCV/subreg-liveness.mir
@@ -0,0 +1,27 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=none %s -o - | FileCheck %s
+# REQUIRES: riscv64-registered-target
+
+# The MachineVerifier assumes that used registers have been defined.
+# In this test case, while $v12_v13_v14_v15_v16 covers $v14_v15, $v14_v15 is
+# not a sub-register of $v14m2 even though they share the same register.
+# This corner case can be resolved by checking the register using RegUnit.
+
+...
+---
+name: func
+tracksRegLiveness: true
+tracksDebugUserValues: true
+body: |
+  bb.0:
+    liveins: $v0, $v8, $v9, $v10, $v11
+
+    ; CHECK-LABEL: name: func
+    ; CHECK: liveins: $v0, $v8, $v9, $v10, $v11
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: renamable $v16m2 = PseudoVMV_V_I_M2 undef renamable $v16m2, 0, -1, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: $v20m2 = VMV2R_V $v14m2, implicit $v12_v13_v14_v15_v16
+    renamable $v16m2 = PseudoVMV_V_I_M2 undef renamable $v16m2, 0, -1, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+    $v20m2 = VMV2R_V $v14m2, implicit $v12_v13_v14_v15_v16
+
+...
diff --git a/llvm/test/Transforms/Inline/LoongArch/inline-target-features.ll b/llvm/test/Transforms/Inline/LoongArch/inline-target-features.ll
new file mode 100644
index 0000000000000..f7a37015e07fc
--- /dev/null
+++ b/llvm/test/Transforms/Inline/LoongArch/inline-target-features.ll
@@ -0,0 +1,34 @@
+; RUN: opt < %s -mtriple=loongarch64-unknown-linux-gnu -S -passes=inline | FileCheck %s
+; RUN: opt < %s -mtriple=loongarch64-unknown-linux-gnu -S -passes='cgscc(inline)' | FileCheck %s
+; Check that we only inline when we have compatible target attributes.
+
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "loongarch64-unknown-linux-gnu"
+
+define i32 @foo() #0 {
+entry:
+  %call = call i32 (...) @baz()
+  ret i32 %call
+; CHECK-LABEL: foo
+; CHECK: call i32 (...) @baz()
+}
+declare i32 @baz(...) #0
+
+define i32 @bar() #1 {
+entry:
+  %call = call i32 @foo()
+  ret i32 %call
+; CHECK-LABEL: bar
+; CHECK: call i32 (...) @baz()
+}
+
+define i32 @qux() #0 {
+entry:
+  %call = call i32 @bar()
+  ret i32 %call
+; CHECK-LABEL: qux
+; CHECK: call i32 @bar()
+}
+
+attributes #0 = { "target-cpu"="generic-la64" "target-features"="+f,+d" }
+attributes #1 = { "target-cpu"="generic-la64" "target-features"="+f,+d,+lsx,+lasx" }
diff --git a/llvm/test/Transforms/Inline/LoongArch/lit.local.cfg b/llvm/test/Transforms/Inline/LoongArch/lit.local.cfg
new file mode 100644
index 0000000000000..cc24278acbb41
--- /dev/null
+++ b/llvm/test/Transforms/Inline/LoongArch/lit.local.cfg
@@ -0,0 +1,2 @@
+if not "LoongArch" in config.root.targets:
+    config.unsupported = True
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll
new file mode 100644
index 0000000000000..d9c105f753e26
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-- -passes=instcombine -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
+
+define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) {
+; OPT-NEXT: [[BB:.*:]]
+; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1:[0-9]+]]
+; OPT-NEXT: store i32 [[TMP]], ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT: ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; OPT-W32-NEXT: [[BB:.*:]]
+; OPT-W32-NEXT: store i32 32, ptr addrspace(1) [[ARG]], align 4
+; OPT-W32-NEXT: ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; OPT-W64-NEXT: [[BB:.*:]]
+; OPT-W64-NEXT: store i32 64, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT: ret void
+;
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
+  store i32 %tmp, ptr addrspace(1) %arg, align 4
+  ret void
+}
+
+define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) {
+; OPT-NEXT: [[BB:.*:]]
+; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1]]
+; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
+; OPT-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 2, i32 1
+; OPT-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT: ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; OPT-W32-NEXT: [[BB:.*:]]
+; OPT-W32-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-W32-NEXT: ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; OPT-W64-NEXT: [[BB:.*:]]
+; OPT-W64-NEXT: store i32 2, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT: ret void
+;
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
+  %tmp1 = icmp ugt i32 %tmp, 32
+  %tmp2 = select i1 %tmp1, i32 2, i32 1
+  store i32 %tmp2, ptr addrspace(1) %arg
+  ret void
+}
+
+define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) {
+; OPT-NEXT: [[BB:.*:]]
+; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1]]
+; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
+; OPT-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
+; OPT: [[BB2]]:
+; OPT-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT: br label %[[BB3]]
+; OPT: [[BB3]]:
+; OPT-NEXT: ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; OPT-W32-NEXT: [[BB:.*:]]
+; OPT-W32-NEXT: br i1 false, label %[[BB2:.*]], label %[[BB3:.*]]
+; OPT-W32: [[BB2]]:
+; OPT-W32-NEXT: br label %[[BB3]]
+; OPT-W32: [[BB3]]:
+; OPT-W32-NEXT: ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; OPT-W64-NEXT: [[BB:.*:]]
+; OPT-W64-NEXT: br i1 true, label %[[BB2:.*]], label %[[BB3:.*]]
+; OPT-W64: [[BB2]]:
+; OPT-W64-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT: br label %[[BB3]]
+; OPT-W64: [[BB3]]:
+; OPT-W64-NEXT: ret void
+;
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
+  %tmp1 = icmp ugt i32 %tmp, 32
+  br i1 %tmp1, label %bb2, label %bb3
+
+bb2: ; preds = %bb
+  store i32 1, ptr addrspace(1) %arg, align 4
+  br label %bb3
+
+bb3: ; preds = %bb2, %bb
+  ret void
+}
+
+declare i32 @llvm.amdgcn.wavefrontsize() #0
+
+attributes #0 = { nounwind readnone speculatable }
diff --git a/llvm/test/Transforms/InstCombine/and-fcmp.ll b/llvm/test/Transforms/InstCombine/and-fcmp.ll
index 30b9fca6e97ad..c7bbc8ab56f9a 100644
--- a/llvm/test/Transforms/InstCombine/and-fcmp.ll
+++ b/llvm/test/Transforms/InstCombine/and-fcmp.ll
@@ -5044,11 +5044,9 @@ define i1 @isnormal_logical_select_0_fmf1(half %x) {
 
 define i1 @and_fcmp_reassoc1(i1 %x, double %a, double %b) {
 ; CHECK-LABEL: @and_fcmp_reassoc1(
-; CHECK-NEXT: [[TMP1:%.*]] = fcmp ult double [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[CMP1:%.*]] = fcmp ugt double [[A]], [[B]]
+; CHECK-NEXT: [[TMP1:%.*]] = fcmp uno double [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT: [[RETVAL:%.*]] = and i1 [[TMP1]], [[X:%.*]]
-; CHECK-NEXT: [[RETVAL1:%.*]] = and i1 [[RETVAL]], [[CMP1]]
-; CHECK-NEXT: ret i1 [[RETVAL1]]
+; CHECK-NEXT: ret i1 [[RETVAL]]
 ;
   %cmp = fcmp ult double %a, %b
   %cmp1 = fcmp ugt double %a, %b
@@ -5059,11 +5057,9 @@ define i1 @and_fcmp_reassoc1(i1 %x, double %a, double %b) {
 
 define i1 @and_fcmp_reassoc2(i1 %x, double %a, double %b) {
 ; CHECK-LABEL: @and_fcmp_reassoc2(
-; CHECK-NEXT: [[TMP1:%.*]] = fcmp ult double [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[CMP1:%.*]] = fcmp ugt double [[A]], [[B]]
+; CHECK-NEXT: [[TMP1:%.*]] = fcmp uno double [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT: [[RETVAL:%.*]] = and i1 [[X:%.*]], [[TMP1]]
-; CHECK-NEXT: [[RETVAL1:%.*]] = and i1 [[RETVAL]], [[CMP1]]
-; CHECK-NEXT: ret i1 [[RETVAL1]]
+; CHECK-NEXT: ret i1 [[RETVAL]]
 ;
   %cmp = fcmp ult double %a, %b
   %cmp1 = fcmp ugt double %a, %b
@@ -5074,11 +5070,9 @@ define i1 @and_fcmp_reassoc2(i1 %x, double %a, double %b) {
 
 define i1 @and_fcmp_reassoc3(i1 %x, double %a, double %b) {
 ; CHECK-LABEL: @and_fcmp_reassoc3(
-; CHECK-NEXT: [[TMP1:%.*]] = fcmp ult double [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[CMP1:%.*]] = fcmp ugt double [[A]], [[B]]
+; CHECK-NEXT: [[TMP1:%.*]] = fcmp uno double [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT: [[RETVAL:%.*]] = and i1 [[TMP1]], [[X:%.*]]
-; CHECK-NEXT: [[RETVAL1:%.*]] = and i1 [[CMP1]], [[RETVAL]]
-; CHECK-NEXT: ret i1 [[RETVAL1]]
+; CHECK-NEXT: ret i1 [[RETVAL]]
 ;
   %cmp = fcmp ult double %a, %b
   %cmp1 = fcmp ugt double %a, %b
@@ -5089,11 +5083,9 @@ define i1 @and_fcmp_reassoc3(i1 %x, double %a, double %b) {
 
 define i1 @and_fcmp_reassoc4(i1 %x, double %a, double %b) {
 ; CHECK-LABEL: @and_fcmp_reassoc4(
-; CHECK-NEXT: [[TMP1:%.*]] = fcmp ult double [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[CMP1:%.*]] = fcmp ugt double [[A]], [[B]]
+; CHECK-NEXT: [[TMP1:%.*]] = fcmp uno double [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT: [[RETVAL:%.*]] = and i1 [[X:%.*]], [[TMP1]]
-; CHECK-NEXT: [[RETVAL1:%.*]] = and i1 [[CMP1]], [[RETVAL]]
-; CHECK-NEXT: ret i1 [[RETVAL1]]
+; CHECK-NEXT: ret i1 [[RETVAL]]
 ;
   %cmp = fcmp ult double %a, %b
   %cmp1 = fcmp ugt double %a, %b
diff --git a/llvm/test/Transforms/InstCombine/eq-of-parts.ll b/llvm/test/Transforms/InstCombine/eq-of-parts.ll
index 00ee7bf643286..d07c2e6a5be52 100644
--- a/llvm/test/Transforms/InstCombine/eq-of-parts.ll
+++ b/llvm/test/Transforms/InstCombine/eq-of-parts.ll
@@ -1441,11 +1441,7 @@ define i1 @ne_optimized_highbits_cmp_todo_overlapping(i32 %x, i32 %y) {
 
 define i1 @and_trunc_i1(i8 %a1, i8 %a2) {
 ; CHECK-LABEL: @and_trunc_i1(
-; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[A1:%.*]], [[A2:%.*]]
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[XOR]], 2
-; CHECK-NEXT: [[LOBIT:%.*]] = trunc i8 [[XOR]] to i1
-; CHECK-NEXT: [[LOBIT_INV:%.*]] = xor i1 [[LOBIT]], true
-; CHECK-NEXT: [[AND:%.*]] = and i1 [[CMP]], [[LOBIT_INV]]
+; CHECK-NEXT: [[AND:%.*]] = icmp eq i8 [[A1:%.*]], [[A2:%.*]]
 ; CHECK-NEXT: ret i1 [[AND]]
 ;
   %xor = xor i8 %a1, %a2
@@ -1494,10 +1490,7 @@ define i1 @and_trunc_i1_wrong_operands(i8 %a1, i8 %a2, i8 %a3) {
 
 define i1 @or_trunc_i1(i64 %a1, i64 %a2) {
 ; CHECK-LABEL: @or_trunc_i1(
-; CHECK-NEXT: [[XOR:%.*]] = xor i64 [[A2:%.*]], [[A1:%.*]]
-; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[XOR]], 1
-; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[XOR]] to i1
-; CHECK-NEXT: [[OR:%.*]] = or i1 [[CMP]], [[TRUNC]]
+; CHECK-NEXT: [[OR:%.*]] = icmp ne i64 [[A2:%.*]], [[A1:%.*]]
 ; CHECK-NEXT: ret i1 [[OR]]
 ;
   %xor = xor i64 %a2, %a1
@@ -1538,3 +1531,28 @@ define i1 @or_trunc_i1_wrong_operands(i64 %a1, i64 %a2, i64 %a3) {
   %or = or i1 %cmp, %trunc
   ret i1 %or
 }
+
+define i1 @jv_identical(i64 %arg1, i64 %arg2) {
+; CHECK-LABEL: @jv_identical(
+; CHECK-NEXT: [[ARG1_TRUNC:%.*]] = trunc i64 [[ARG1:%.*]] to i8
+; CHECK-NEXT: [[ARG2_TRUNC:%.*]] = trunc i64 [[ARG2:%.*]] to i8
+; CHECK-NEXT: [[EQ1:%.*]] = icmp eq i8 [[ARG1_TRUNC]], [[ARG2_TRUNC]]
+; CHECK-NEXT: [[DOTUNSHIFTED:%.*]] = xor i64 [[ARG2]], [[ARG1]]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i64 [[DOTUNSHIFTED]], 65536
+; CHECK-NEXT: [[AND2:%.*]] = and i1 [[EQ1]], [[TMP1]]
+; CHECK-NEXT: ret i1 [[AND2]]
+;
+  %arg1.trunc = trunc i64 %arg1 to i8
+  %arg1.shift = lshr i64 %arg1, 16
+  %arg1.shift.trunc = trunc i64 %arg1.shift to i16
+  %arg2.trunc = trunc i64 %arg2 to i8
+  %arg2.shift = lshr i64 %arg2, 16
+  %arg2.shift.trunc = trunc i64 %arg2.shift to i16
+  %eq1 = icmp eq i8 %arg1.trunc, %arg2.trunc
+  %eq2 = icmp eq i16 %arg1.shift.trunc, %arg2.shift.trunc
+  %and1 = and i1 %eq1, %eq2
+  %xor = xor i64 %arg2, %arg1
+  %cmp = icmp ult i64 %xor, 4294967296
+  %and2 = and i1 %cmp, %and1
+  ret i1 %and2
+}
diff --git a/llvm/test/Transforms/InstCombine/fptrunc.ll b/llvm/test/Transforms/InstCombine/fptrunc.ll
index f46940ff060d4..a4296a326c4bc 100644
--- a/llvm/test/Transforms/InstCombine/fptrunc.ll
+++ b/llvm/test/Transforms/InstCombine/fptrunc.ll
@@ -90,6 +90,19 @@ define half @fptrunc_select_true_val_extra_use(half %x, float %y, i1 %cond) {
   ret half %r
 }
 
+define half @fptrunc_max(half %arg) {
+; CHECK-LABEL: @fptrunc_max(
+; CHECK-NEXT: [[CMP:%.*]] = fcmp olt half [[ARG:%.*]], 0xH0000
+; CHECK-NEXT: [[NARROW_SEL:%.*]] = select i1 [[CMP]], half 0xH0000, half [[ARG]]
+; CHECK-NEXT: ret half [[NARROW_SEL]]
+;
+  %ext = fpext half %arg to double
+  %cmp = fcmp olt double %ext, 0.000000e+00
+  %max = select i1 %cmp, double 0.000000e+00, double %ext
+  %trunc = fptrunc double %max to half
+  ret half %trunc
+}
+
 ; Negative test - this would require an extra instruction.
 
 define half @fptrunc_select_true_val_extra_use_2(half %x, float %y, i1 %cond) {
diff --git a/llvm/test/Transforms/InstCombine/or-fcmp.ll b/llvm/test/Transforms/InstCombine/or-fcmp.ll
index a2842f7a45f59..193fe4b5cc722 100644
--- a/llvm/test/Transforms/InstCombine/or-fcmp.ll
+++ b/llvm/test/Transforms/InstCombine/or-fcmp.ll
@@ -54,7 +54,7 @@ define i1 @PR41069(double %a, double %b, double %c, double %d) {
 ; CHECK-LABEL: @PR41069(
 ; CHECK-NEXT: [[UNO1:%.*]] = fcmp uno double [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT: [[TMP1:%.*]] = fcmp uno double [[D:%.*]], [[C:%.*]]
-; CHECK-NEXT: [[R:%.*]] = or i1 [[TMP1]], [[UNO1]]
+; CHECK-NEXT: [[R:%.*]] = or i1 [[UNO1]], [[TMP1]]
 ; CHECK-NEXT: ret i1 [[R]]
 ;
   %uno1 = fcmp uno double %a, %b
@@ -87,7 +87,7 @@ define i1 @PR41069_commute(double %a, double %b, double %c, double %d) {
 ; CHECK-LABEL: @PR41069_commute(
 ; CHECK-NEXT: [[UNO1:%.*]] = fcmp uno double [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT: [[TMP1:%.*]] = fcmp uno double [[D:%.*]], [[C:%.*]]
-; CHECK-NEXT: [[R:%.*]] = or i1 [[TMP1]], [[UNO1]]
+; CHECK-NEXT: [[R:%.*]] = or i1 [[UNO1]], [[TMP1]]
 ; CHECK-NEXT: ret i1 [[R]]
 ;
   %uno1 = fcmp uno double %a, %b
@@ -4608,11 +4608,9 @@ define i1 @intersect_fmf_4(double %a, double %b) {
 
 define i1 @or_fcmp_reassoc1(i1 %x, double %a, double %b) {
 ; CHECK-LABEL: @or_fcmp_reassoc1(
-; CHECK-NEXT: [[OR:%.*]] = fcmp olt double [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt double [[A]], [[B]]
+; CHECK-NEXT: [[OR:%.*]] = fcmp one double [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT: [[RETVAL:%.*]] = or i1 [[OR]], [[CMP1:%.*]]
-; CHECK-NEXT: [[RETVAL1:%.*]] = or i1 [[RETVAL]], [[CMP2]]
-; CHECK-NEXT: ret i1 [[RETVAL1]]
+; CHECK-NEXT: ret i1 [[RETVAL]]
 ;
   %cmp = fcmp olt double %a, %b
   %cmp1 = fcmp ogt double %a, %b
@@ -4623,11 +4621,9 @@ define i1 @or_fcmp_reassoc1(i1 %x, double %a, double %b) {
 
 define i1 @or_fcmp_reassoc2(i1 %x, double %a, double %b) {
 ; CHECK-LABEL: @or_fcmp_reassoc2(
-; CHECK-NEXT: [[TMP1:%.*]] = fcmp olt double [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[CMP1:%.*]] = fcmp ogt double [[A]], [[B]]
+; CHECK-NEXT: [[TMP1:%.*]] = fcmp one double [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT: [[RETVAL:%.*]] = or i1 [[X:%.*]], [[TMP1]]
-; CHECK-NEXT: [[RETVAL1:%.*]] = or i1 [[RETVAL]], [[CMP1]]
-; CHECK-NEXT: ret i1 [[RETVAL1]]
+; CHECK-NEXT: ret i1 [[RETVAL]]
 ;
   %cmp = fcmp olt double %a, %b
   %cmp1 = fcmp ogt double %a, %b
@@ -4638,11 +4634,9 @@ define i1 @or_fcmp_reassoc2(i1 %x, double %a, double %b) {
 
 define i1 @or_fcmp_reassoc3(i1 %x, double %a, double %b) {
 ; CHECK-LABEL: @or_fcmp_reassoc3(
-; CHECK-NEXT: [[TMP1:%.*]] = fcmp olt double [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[CMP1:%.*]] = fcmp ogt double [[A]], [[B]]
+; CHECK-NEXT: [[TMP1:%.*]] = fcmp one double [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT: [[RETVAL:%.*]] = or i1 [[TMP1]], [[X:%.*]]
-; CHECK-NEXT: [[RETVAL1:%.*]] = or i1 [[CMP1]], [[RETVAL]]
-; CHECK-NEXT: ret i1 [[RETVAL1]]
+; CHECK-NEXT: ret i1 [[RETVAL]]
 ;
   %cmp = fcmp olt double %a, %b
   %cmp1 = fcmp ogt double %a, %b
@@ -4653,11 +4647,9 @@ define i1 @or_fcmp_reassoc3(i1 %x, double %a, double %b) {
 
 define i1 @or_fcmp_reassoc4(i1 %x, double %a, double %b) {
 ; CHECK-LABEL: @or_fcmp_reassoc4(
-; CHECK-NEXT: [[TMP1:%.*]] = fcmp olt double [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[CMP1:%.*]] = fcmp ogt double [[A]], [[B]]
+; CHECK-NEXT: [[TMP1:%.*]] = fcmp one double [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT: [[RETVAL:%.*]] = or i1 [[X:%.*]], [[TMP1]]
-; CHECK-NEXT: [[RETVAL1:%.*]] = or i1 [[CMP1]], [[RETVAL]]
-; CHECK-NEXT: ret i1 [[RETVAL1]]
+; CHECK-NEXT: ret i1 [[RETVAL]]
 ;
   %cmp = fcmp olt double %a, %b
   %cmp1 = fcmp ogt double %a, %b
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll b/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll
index 405afd5969a41..5c9058b482320 100644
--- a/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll
+++ b/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll
@@ -276,21 +276,19 @@ if.false: ; preds = %if.true, %entry
 }
 
 ;; Both of successor 0 and successor 1 have a single predecessor.
-;; TODO: Support transform for this case.
-define void @single_predecessor(ptr %p, ptr %q, i32 %a) {
+define i32 @single_predecessor(ptr %p, ptr %q, i32 %a) {
 ; CHECK-LABEL: @single_predecessor(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0
-; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
-; CHECK: common.ret:
-; CHECK-NEXT: ret void
-; CHECK: if.end:
-; CHECK-NEXT: store i32 1, ptr [[Q:%.*]], align 4
-; CHECK-NEXT: br label [[COMMON_RET:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[Q]], align 4
-; CHECK-NEXT: store i32 [[TMP0]], ptr [[P:%.*]], align 4
-; CHECK-NEXT: br label [[COMMON_RET]]
+; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[TOBOOL]], true
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i1 [[TOBOOL]] to <1 x i1>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 1), ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP2]])
+; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q]], i32 4, <1 x i1> [[TMP1]], <1 x i32> poison)
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP1]])
+; CHECK-NEXT: [[DOT:%.*]] = select i1 [[TOBOOL]], i32 2, i32 3
+; CHECK-NEXT: ret i32 [[DOT]]
 ;
 entry:
   %tobool = icmp ne i32 %a, 0
@@ -298,12 +296,12 @@ entry:
 
 if.end:
   store i32 1, ptr %q
-  ret void
+  ret i32 2
 
 if.then:
   %0 = load i32, ptr %q
   store i32 %0, ptr %p
-  ret void
+  ret i32 3
 }
 
 ;; Hoist 6 stores.
@@ -759,6 +757,44 @@ if.true:
   ret i32 %res
 }
 
+;; Do not transform if either BB has multiple successors.
+define i32 @not_multi_successors(i1 %c1, i32 %c2, ptr %p) {
+; CHECK-LABEL: @not_multi_successors(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C1:%.*]], label [[ENTRY_IF:%.*]], label [[COMMON_RET:%.*]]
+; CHECK: entry.if:
+; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[P:%.*]], align 4
+; CHECK-NEXT: switch i32 [[C2:%.*]], label [[COMMON_RET]] [
+; CHECK-NEXT: i32 0, label [[SW_BB:%.*]]
+; CHECK-NEXT: i32 1, label [[SW_BB]]
+; CHECK-NEXT: ]
+; CHECK: common.ret:
+; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VAL]], [[ENTRY_IF]] ], [ 0, [[SW_BB]] ]
+; CHECK-NEXT: ret i32 [[COMMON_RET_OP]]
+; CHECK: sw.bb:
+; CHECK-NEXT: br label [[COMMON_RET]]
+;
+entry:
+  br i1 %c1, label %entry.if, label %entry.else
+
+entry.if: ; preds = %entry
+  %val = load i32, ptr %p, align 4
+  switch i32 %c2, label %return [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb
+  ]
+
+entry.else: ; preds = %entry
+  ret i32 0
+
+sw.bb: ; preds = %entry.if, %entry.if
+  br label %return
+
+return: ; preds = %sw.bb, %entry.if
+  %ret = phi i32 [ %val, %entry.if ], [ 0, %sw.bb ]
+  ret i32 %ret
+}
+
 declare i32 @read_memory_only() readonly nounwind willreturn speculatable
 
 !llvm.dbg.cu = !{!0}
diff --git a/llvm/unittests/ProfileData/MemProfTest.cpp b/llvm/unittests/ProfileData/MemProfTest.cpp
index c3f35e41b5fc7..7b9910e295df9 100644
--- a/llvm/unittests/ProfileData/MemProfTest.cpp
+++ b/llvm/unittests/ProfileData/MemProfTest.cpp
@@ -628,7 +628,7 @@ TEST(MemProf, RadixTreeBuilderEmpty) {
   FrameHistogram = llvm::memprof::computeFrameHistogram(MemProfCallStackData);
   llvm::memprof::CallStackRadixTreeBuilder Builder;
-  Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
+  Builder.build(std::move(MemProfCallStackData), &MemProfFrameIndexes,
                 FrameHistogram);
   ASSERT_THAT(Builder.getRadixArray(), testing::IsEmpty());
   const auto Mappings = Builder.takeCallStackPos();
@@ -646,7 +646,7 @@ TEST(MemProf, RadixTreeBuilderOne) {
   FrameHistogram = llvm::memprof::computeFrameHistogram(MemProfCallStackData);
   llvm::memprof::CallStackRadixTreeBuilder Builder;
-  Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
+  Builder.build(std::move(MemProfCallStackData), &MemProfFrameIndexes,
                 FrameHistogram);
   EXPECT_THAT(Builder.getRadixArray(), testing::ElementsAreArray({
                   3U, // Size of CS1,
@@ -673,7 +673,7 @@ TEST(MemProf, RadixTreeBuilderTwo) {
   FrameHistogram = llvm::memprof::computeFrameHistogram(MemProfCallStackData);
   llvm::memprof::CallStackRadixTreeBuilder Builder;
-  Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
+  Builder.build(std::move(MemProfCallStackData), &MemProfFrameIndexes,
                 FrameHistogram);
   EXPECT_THAT(Builder.getRadixArray(),
               testing::ElementsAreArray({
@@ -711,7 +711,7 @@ TEST(MemProf, RadixTreeBuilderSuccessiveJumps) {
   FrameHistogram = llvm::memprof::computeFrameHistogram(MemProfCallStackData);
   llvm::memprof::CallStackRadixTreeBuilder Builder;
-  Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
+  Builder.build(std::move(MemProfCallStackData), &MemProfFrameIndexes,
                 FrameHistogram);
   EXPECT_THAT(Builder.getRadixArray(),
              testing::ElementsAreArray({
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h
index 2ea589a7c4c3b..8b380751c2f9d 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h
@@ -73,145 +73,6 @@ DEFINE_TRIVIAL_LLVM_TYPE(LLVMMetadataType, "llvm.metadata");
 
 #undef DEFINE_TRIVIAL_LLVM_TYPE
 
-//===----------------------------------------------------------------------===//
-// LLVMStructType.
-//===----------------------------------------------------------------------===//
-
-/// LLVM dialect structure type representing a collection of different-typed
-/// elements manipulated together. Structured can optionally be packed, meaning
-/// that their elements immediately follow each other in memory without
-/// accounting for potential alignment.
-///
-/// Structure types can be identified (named) or literal. Literal structures
-/// are uniquely represented by the list of types they contain and packedness.
-/// Literal structure types are immutable after construction.
-///
-/// Identified structures are uniquely represented by their name, a string. They
-/// have a mutable component, consisting of the list of types they contain,
-/// the packedness and the opacity bits. Identified structs can be created
-/// without providing the lists of element types, making them suitable to
-/// represent recursive, i.e. self-referring, structures. Identified structs
-/// without body are considered opaque. For such structs, one can set the body.
-/// Identified structs can be created as intentionally-opaque, implying that the
-/// caller does not intend to ever set the body (e.g. forward-declarations of
-/// structs from another module) and wants to disallow further modification of
-/// the body. For intentionally-opaque structs or non-opaque structs with the
-/// body, one is not allowed to set another body (however, one can set exactly
-/// the same body).
-///
-/// Note that the packedness of the struct takes place in uniquing of literal
-/// structs, but does not in uniquing of identified structs.
-class LLVMStructType
-    : public Type::TypeBase<LLVMStructType, Type, detail::LLVMStructTypeStorage,
-                            DataLayoutTypeInterface::Trait,
-                            DestructurableTypeInterface::Trait,
-                            TypeTrait::IsMutable> {
-public:
-  /// Inherit base constructors.
-  using Base::Base;
-
-  static constexpr StringLiteral name = "llvm.struct";
-
-  /// Checks if the given type can be contained in a structure type.
-  static bool isValidElementType(Type type);
-
-  /// Gets or creates an identified struct with the given name in the provided
-  /// context. Note that unlike llvm::StructType::create, this function will
-  /// _NOT_ rename a struct in case a struct with the same name already exists
-  /// in the context. Instead, it will just return the existing struct,
-  /// similarly to the rest of MLIR type ::get methods.
-  static LLVMStructType getIdentified(MLIRContext *context, StringRef name);
-  static LLVMStructType
-  getIdentifiedChecked(function_ref<InFlightDiagnostic()> emitError,
-                       MLIRContext *context, StringRef name);
-
-  /// Gets a new identified struct with the given body. The body _cannot_ be
-  /// changed later. If a struct with the given name already exists, renames
-  /// the struct by appending a `.` followed by a number to the name. Renaming
-  /// happens even if the existing struct has the same body.
-  static LLVMStructType getNewIdentified(MLIRContext *context, StringRef name,
-                                         ArrayRef<Type> elements,
-                                         bool isPacked = false);
-
-  /// Gets or creates a literal struct with the given body in the provided
-  /// context.
-  static LLVMStructType getLiteral(MLIRContext *context, ArrayRef<Type> types,
-                                   bool isPacked = false);
-  static LLVMStructType
-  getLiteralChecked(function_ref<InFlightDiagnostic()> emitError,
-                    MLIRContext *context, ArrayRef<Type> types,
-                    bool isPacked = false);
-
-  /// Gets or creates an intentionally-opaque identified struct. Such a struct
-  /// cannot have its body set. To create an opaque struct with a mutable body,
-  /// use `getIdentified`. Note that unlike llvm::StructType::create, this
-  /// function will _NOT_ rename a struct in case a struct with the same name
-  /// already exists in the context. Instead, it will just return the existing
-  /// struct, similarly to the rest of MLIR type ::get methods.
-  static LLVMStructType getOpaque(StringRef name, MLIRContext *context);
-  static LLVMStructType
-  getOpaqueChecked(function_ref<InFlightDiagnostic()> emitError,
-                   MLIRContext *context, StringRef name);
-
-  /// Set the body of an identified struct. Returns failure if the body could
-  /// not be set, e.g. if the struct already has a body or if it was marked as
-  /// intentionally opaque. This might happen in a multi-threaded context when a
-  /// different thread modified the struct after it was created. Most callers
-  /// are likely to assert this always succeeds, but it is possible to implement
-  /// a local renaming scheme based on the result of this call.
-  LogicalResult setBody(ArrayRef<Type> types, bool isPacked);
-
-  /// Checks if a struct is packed.
-  bool isPacked() const;
-
-  /// Checks if a struct is identified.
-  bool isIdentified() const;
-
-  /// Checks if a struct is opaque.
-  bool isOpaque();
-
-  /// Checks if a struct is initialized.
-  bool isInitialized();
-
-  /// Returns the name of an identified struct.
-  StringRef getName();
-
-  /// Returns the list of element types contained in a non-opaque struct.
-  ArrayRef<Type> getBody() const;
-
-  /// Verifies that the type about to be constructed is well-formed.
-  static LogicalResult
-  verifyInvariants(function_ref<InFlightDiagnostic()> emitError, StringRef,
-                   bool);
-  static LogicalResult
-  verifyInvariants(function_ref<InFlightDiagnostic()> emitError,
-                   ArrayRef<Type> types, bool);
-  using Base::verifyInvariants;
-
-  /// Hooks for DataLayoutTypeInterface. Should not be called directly. Obtain a
-  /// DataLayout instance and query it instead.
-  llvm::TypeSize getTypeSizeInBits(const DataLayout &dataLayout,
-                                   DataLayoutEntryListRef params) const;
-
-  uint64_t getABIAlignment(const DataLayout &dataLayout,
-                           DataLayoutEntryListRef params) const;
-
-  uint64_t getPreferredAlignment(const DataLayout &dataLayout,
-                                 DataLayoutEntryListRef params) const;
-
-  bool areCompatible(DataLayoutEntryListRef oldLayout,
-                     DataLayoutEntryListRef newLayout) const;
-
-  LogicalResult verifyEntries(DataLayoutEntryListRef entries,
-                              Location loc) const;
-
-  /// Destructs the struct into its indexed field types.
-  std::optional<DenseMap<Attribute, Type>> getSubelementIndexMap();
-
-  /// Returns which type is stored at a given integer index within the struct.
-  Type getTypeAtIndex(Attribute index);
-};
-
 //===----------------------------------------------------------------------===//
 // Printing and parsing.
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td
index 09dd0919c318f..e88139fa5b28d 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td
@@ -117,6 +117,140 @@ def LLVMFunctionType : LLVMType<"LLVMFunction", "func"> {
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// LLVMStructType
+//===----------------------------------------------------------------------===//
+
+def LLVMStructType : LLVMType<"LLVMStruct", "struct", [
+    MutableType,
+    DeclareTypeInterfaceMethods<DataLayoutTypeInterface>,
+    DeclareTypeInterfaceMethods<DestructurableTypeInterface>
+  ]> {
+  let summary = "LLVM struct type";
+
+  let description = [{
+    LLVM dialect structure type representing a collection of different-typed
+    elements manipulated together. Struct types can optionally be packed, meaning
+    that their elements immediately follow each other in memory without
+    accounting for potential alignment.
+
+    Structure types can be identified (named) or literal. Literal structures
+    are uniquely represented by the list of types they contain and packedness.
+    Literal structure types are immutable after construction.
+
+    Identified structures are uniquely represented by their name, a string. They
+    have a mutable component, consisting of the list of types they contain,
+    the packedness and the opacity bits. Identified structs can be created
+    without providing the lists of element types, making them suitable to
+    represent recursive, i.e. self-referring, structures. Identified structs
+    without body are considered opaque. For such structs, one can set the body.
+    Identified structs can be created as intentionally-opaque, implying that the
+    caller does not intend to ever set the body (e.g. forward-declarations of
+    structs from another module) and wants to disallow further modification of
+    the body. For intentionally-opaque structs or non-opaque structs with the
+    body, one is not allowed to set another body (however, one can set exactly
+    the same body).
+
+    Note that the packedness of the struct takes place in uniquing of literal
+    structs, but does not in uniquing of identified structs.
+  }];
+
+  // Specify parameters for which TableGen can generate convenient getters for
+  // us.
+  // TODO: Other parameters such as 'packed' or 'opaque' could be added in the
+  //       future iff they generate getters prefixed with 'is', instead of
+  //       'get'. Until then there are no advantages in doing so.
+  let parameters = (ins
+    StringRefParameter<"struct name", [{""}]>:$name,
+    OptionalArrayRefParameter<"mlir::Type">:$body
+  );
+
+  // A custom storage class defined in C++ is required to implement mutability.
+  let storageClass = "LLVMStructTypeStorage";
+  let genStorageClass = 0;
+
+  // We want users to use the more aptly named custom builders below.
+  let skipDefaultBuilders = 1;
+
+  let extraClassDeclaration = [{
+    /// Checks if the given type can be contained in a structure type.
+    static bool isValidElementType(Type type);
+
+    /// Gets or creates an identified struct with the given name in the provided
+    /// context. Note that unlike llvm::StructType::create, this function will
+    /// _NOT_ rename a struct in case a struct with the same name already exists
+    /// in the context. Instead, it will just return the existing struct,
+    /// similarly to the rest of MLIR type ::get methods.
+    static LLVMStructType getIdentified(MLIRContext *context, StringRef name);
+    static LLVMStructType
+    getIdentifiedChecked(function_ref<InFlightDiagnostic()> emitError,
+                         MLIRContext *context, StringRef name);
+
+    /// Gets a new identified struct with the given body. The body _cannot_ be
+    /// changed later. If a struct with the given name already exists, renames
+    /// the struct by appending a `.` followed by a number to the name. Renaming
+    /// happens even if the existing struct has the same body.
+    static LLVMStructType getNewIdentified(MLIRContext *context, StringRef name,
+                                           ArrayRef<Type> elements,
+                                           bool isPacked = false);
+
+    /// Gets or creates a literal struct with the given body in the provided
+    /// context.
+    static LLVMStructType getLiteral(MLIRContext *context, ArrayRef<Type> types,
+                                     bool isPacked = false);
+
+    static LLVMStructType
+    getLiteralChecked(function_ref<InFlightDiagnostic()> emitError,
+                      MLIRContext *context, ArrayRef<Type> types,
+                      bool isPacked = false);
+
+    /// Gets or creates an intentionally-opaque identified struct. Such a struct
+    /// cannot have its body set.
+    /// Note that unlike llvm::StructType::create, this function will _NOT_
+    /// rename a struct in case a struct with the same name
+    /// already exists in the context. Instead, it will just return the existing
+    /// struct, similarly to the rest of MLIR type ::get methods.
+    static LLVMStructType getOpaque(StringRef name, MLIRContext *context);
+
+    static LLVMStructType
+    getOpaqueChecked(function_ref<InFlightDiagnostic()> emitError,
+                     MLIRContext *context, StringRef name);
+
+    /// Set the body of an identified struct. Returns failure if the body could
+    /// not be set, e.g. if the struct already has a body or if it was marked as
+    /// intentionally opaque. This might happen in a multi-threaded context when a
+    /// different thread modified the struct after it was created. Most callers
+    /// are likely to assert this always succeeds, but it is possible to implement
+    /// a local renaming scheme based on the result of this call.
+    LogicalResult setBody(ArrayRef<Type> types, bool isPacked);
+
+    /// Checks if a struct is packed.
+    bool isPacked() const;
+
+    /// Checks if a struct is identified.
+    bool isIdentified() const;
+
+    /// Checks if a struct is opaque.
+    bool isOpaque();
+
+    /// Checks if a struct is initialized.
+    bool isInitialized();
+
+    /// Verifies that the type about to be constructed is well-formed.
+    static LogicalResult
+    verifyInvariants(function_ref<InFlightDiagnostic()> emitError,
+                     StringRef, bool);
+    static LogicalResult
+    verifyInvariants(function_ref<InFlightDiagnostic()> emitError,
+                     ArrayRef<Type> types, bool);
+    using Base::verifyInvariants;
+  }];
+
+  let hasCustomAssemblyFormat = 1;
+}
+
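To make the builder semantics above concrete, here is a small standalone sketch (an editor's illustration, not part of the patch), assuming the generated C++ API matches the declarations in the def:

```cpp
// Sketch only; exercises the uniquing and set-once-body behavior described
// in the LLVMStructType documentation above.
#include <cassert>

#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/MLIRContext.h"

using namespace mlir;

int main() {
  MLIRContext ctx;
  ctx.loadDialect<LLVM::LLVMDialect>();
  Type i32 = IntegerType::get(&ctx, 32);
  Type f32 = Float32Type::get(&ctx);

  // Literal structs are uniqued by element list + packedness, so requesting
  // the same body twice yields the same immutable type.
  auto lit = LLVM::LLVMStructType::getLiteral(&ctx, {i32, f32});
  assert(lit == LLVM::LLVMStructType::getLiteral(&ctx, {i32, f32}));

  // Identified structs are uniqued by name and start without a body, which is
  // what makes self-referential types expressible.
  auto node = LLVM::LLVMStructType::getIdentified(&ctx, "node");
  assert(node.isIdentified() && !node.isInitialized());

  // The first setBody succeeds; installing a *different* body later fails
  // with a LogicalResult rather than renaming (unlike getNewIdentified).
  Type ptr = LLVM::LLVMPointerType::get(&ctx);
  LogicalResult r = node.setBody({i32, ptr}, /*isPacked=*/false);
  assert(succeeded(r));
  (void)r;
  return 0;
}
```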
 //===----------------------------------------------------------------------===//
 // LLVMPointerType
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/IR/AttrTypeBase.td b/mlir/include/mlir/IR/AttrTypeBase.td
index cbe4f0d67574b..38d38cf098df3 100644
--- a/mlir/include/mlir/IR/AttrTypeBase.td
+++ b/mlir/include/mlir/IR/AttrTypeBase.td
@@ -56,6 +56,9 @@ class ParamNativeTypeTrait
 class GenInternalTypeTrait<string prop> : GenInternalTrait<prop, "Type">;
 class PredTypeTrait<string descr, Pred pred> : PredTrait<descr, pred>;
 
+// Trait that must be attached to any type that is mutable.
+def MutableType : NativeTypeTrait<"IsMutable">;
+
 //===----------------------------------------------------------------------===//
 // Builders
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Bindings/Python/IRAttributes.cpp b/mlir/lib/Bindings/Python/IRAttributes.cpp
index 417c66b9165e3..cc9532f4e33b2 100644
--- a/mlir/lib/Bindings/Python/IRAttributes.cpp
+++ b/mlir/lib/Bindings/Python/IRAttributes.cpp
@@ -1102,11 +1102,11 @@ class PyDenseElementsAttribute
     unpackedBooleans = unpackedBooleans[py::slice(0, numBooleans, 1)];
     unpackedBooleans = equalFunc(unpackedBooleans, 1);
 
-    std::vector<intptr_t> shape;
     MlirType shapedType = mlirAttributeGetType(*this);
     intptr_t rank = mlirShapedTypeGetRank(shapedType);
+    std::vector<intptr_t> shape(rank);
    for (intptr_t i = 0; i < rank; ++i) {
-      shape.push_back(mlirShapedTypeGetDimSize(shapedType, i));
+      shape[i] = mlirShapedTypeGetDimSize(shapedType, i);
     }
 
     unpackedBooleans = reshapeFunc(unpackedBooleans, shape);
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
index 9bb0c80749a5f..d30a6b8398f06 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
@@ -3510,8 +3510,7 @@ void LLVMDialect::initialize() {
         LLVMPPCFP128Type,
         LLVMTokenType,
         LLVMLabelType,
-        LLVMMetadataType,
-        LLVMStructType>();
+        LLVMMetadataType>();
   // clang-format on
   registerTypes();
 
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp
index 903035a3ec229..655316cc5d66d 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp
@@ -1566,7 +1566,7 @@ DeletionKind LLVM::MemmoveOp::rewire(const DestructurableMemorySlot &slot,
 //===----------------------------------------------------------------------===//
 
 std::optional<DenseMap<Attribute, Type>>
-LLVM::LLVMStructType::getSubelementIndexMap() {
+LLVM::LLVMStructType::getSubelementIndexMap() const {
   Type i32 = IntegerType::get(getContext(), 32);
   DenseMap<Attribute, Type> destructured;
   for (const auto &[index, elemType] : llvm::enumerate(getBody()))
@@ -1574,7 +1574,7 @@ LLVM::LLVMStructType::getSubelementIndexMap() {
   return destructured;
 }
 
-Type LLVM::LLVMStructType::getTypeAtIndex(Attribute index) {
+Type LLVM::LLVMStructType::getTypeAtIndex(Attribute index) const {
   auto indexAttr = llvm::dyn_cast<IntegerAttr>(index);
   if (!indexAttr || !indexAttr.getType().isInteger(32))
     return {};
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp
index 1bed3fa48b30d..33c231e2d2045 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp
@@ -485,7 +485,7 @@ bool LLVMStructType::isOpaque() {
   return (getImpl()->isOpaque() || !getImpl()->isInitialized());
 }
 bool LLVMStructType::isInitialized() { return getImpl()->isInitialized(); }
-StringRef LLVMStructType::getName() { return getImpl()->getIdentifier(); }
+StringRef LLVMStructType::getName() const { return getImpl()->getIdentifier(); }
 ArrayRef<Type> LLVMStructType::getBody() const {
   return isIdentified() ? getImpl()->getIdentifiedStructBody()
                         : getImpl()->getTypeList();
diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp
index 5acd095da8e38..710c976281dc3 100644
--- a/mlir/lib/Transforms/Utils/DialectConversion.cpp
+++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp
@@ -434,23 +434,25 @@ class MoveBlockRewrite : public BlockRewrite {
 class BlockTypeConversionRewrite : public BlockRewrite {
 public:
   BlockTypeConversionRewrite(ConversionPatternRewriterImpl &rewriterImpl,
-                             Block *block, Block *origBlock)
-      : BlockRewrite(Kind::BlockTypeConversion, rewriterImpl, block),
-        origBlock(origBlock) {}
+                             Block *origBlock, Block *newBlock)
+      : BlockRewrite(Kind::BlockTypeConversion, rewriterImpl, origBlock),
+        newBlock(newBlock) {}
 
   static bool classof(const IRRewrite *rewrite) {
     return rewrite->getKind() == Kind::BlockTypeConversion;
   }
 
-  Block *getOrigBlock() const { return origBlock; }
+  Block *getOrigBlock() const { return block; }
+
+  Block *getNewBlock() const { return newBlock; }
 
   void commit(RewriterBase &rewriter) override;
 
   void rollback() override;
 
 private:
-  /// The original block that was requested to have its signature converted.
-  Block *origBlock;
+  /// The new block that was created as part of this signature conversion.
+  Block *newBlock;
 };
 
 /// Replacing a block argument. This rewrite is not immediately reflected in the
@@ -721,6 +723,18 @@ static bool hasRewrite(R &&rewrites, Operation *op) {
   });
 }
 
+#ifndef NDEBUG
+/// Return "true" if there is a block rewrite that matches the specified
+/// rewrite type and block among the given rewrites.
+template <typename RewriteTy, typename R>
+static bool hasRewrite(R &&rewrites, Block *block) {
+  return any_of(std::forward<R>(rewrites), [&](auto &rewrite) {
+    auto *rewriteTy = dyn_cast<RewriteTy>(rewrite.get());
+    return rewriteTy && rewriteTy->getBlock() == block;
+  });
+}
+#endif // NDEBUG
+
 //===----------------------------------------------------------------------===//
 // ConversionPatternRewriterImpl
 //===----------------------------------------------------------------------===//
@@ -966,12 +980,12 @@ void BlockTypeConversionRewrite::commit(RewriterBase &rewriter) {
   // block.
   if (auto *listener =
           dyn_cast_or_null<RewriterBase::Listener>(rewriter.getListener()))
-    for (Operation *op : block->getUsers())
+    for (Operation *op : getNewBlock()->getUsers())
       listener->notifyOperationModified(op);
 }
 
 void BlockTypeConversionRewrite::rollback() {
-  block->replaceAllUsesWith(origBlock);
+  getNewBlock()->replaceAllUsesWith(getOrigBlock());
 }
 
 void ReplaceBlockArgRewrite::commit(RewriterBase &rewriter) {
@@ -1223,6 +1237,9 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion(
     ConversionPatternRewriter &rewriter, Block *block,
     const TypeConverter *converter,
     TypeConverter::SignatureConversion &signatureConversion) {
+  // A block cannot be converted multiple times.
+  assert(!hasRewrite<BlockTypeConversionRewrite>(rewrites, block) &&
+         "block was already converted");
   OpBuilder::InsertionGuard g(rewriter);
 
   // If no arguments are being changed or added, there is nothing to do.
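For context on the DialectConversion changes above (an editor's illustration, not part of the patch): applySignatureConversion is normally reached through ConversionPatternRewriter. A minimal sketch, assuming a conversion that widens i16 block arguments to i32; the helper name is hypothetical:

```cpp
// Sketch only; "widenBlockArgs" is a made-up helper, not MLIR API.
#include "llvm/ADT/STLExtras.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/Transforms/DialectConversion.h"

using namespace mlir;

static Block *widenBlockArgs(ConversionPatternRewriter &rewriter,
                             Block *block) {
  TypeConverter::SignatureConversion conv(block->getNumArguments());
  for (auto [i, arg] : llvm::enumerate(block->getArguments())) {
    Type t = arg.getType();
    // Map each i16 argument to i32; keep all other argument types unchanged.
    Type newTy = t.isInteger(16) ? IntegerType::get(t.getContext(), 32) : t;
    conv.addInputs(static_cast<unsigned>(i), newTy);
  }
  // Returns the freshly created block (the "newBlock" tracked by
  // BlockTypeConversionRewrite above); the original block is unlinked and
  // erased when the rewrite is committed. Per the new assertion, the same
  // block must not be converted twice.
  return rewriter.applySignatureConversion(block, conv);
}
```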
@@ -1308,7 +1325,7 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion(
     appendRewrite<ReplaceBlockArgRewrite>(block, origArg, converter);
   }
 
-  appendRewrite<BlockTypeConversionRewrite>(newBlock, block);
+  appendRewrite<BlockTypeConversionRewrite>(/*origBlock=*/block, newBlock);
 
   // Erase the old block. (It is just unlinked for now and will be erased during
   // cleanup.)
diff --git a/mlir/test/lib/Dialect/Test/TestTypeDefs.td b/mlir/test/lib/Dialect/Test/TestTypeDefs.td
index 830475bed4e44..60108ac86d1ed 100644
--- a/mlir/test/lib/Dialect/Test/TestTypeDefs.td
+++ b/mlir/test/lib/Dialect/Test/TestTypeDefs.td
@@ -375,7 +375,7 @@ def TestI32 : Test_Type<"TestI32"> {
 }
 
 def TestRecursiveAlias
-    : Test_Type<"TestRecursiveAlias", [NativeTypeTrait<"IsMutable">]> {
+    : Test_Type<"TestRecursiveAlias", [MutableType]> {
   let mnemonic = "test_rec_alias";
   let storageClass = "TestRecursiveTypeStorage";
   let storageNamespace = "test";
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 74f13788ab29f..51d72d2e5f5b2 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -4886,7 +4886,9 @@ cc_library(
         ":FuncExtensions",
        ":FuncToLLVM",
         ":FuncTransformOps",
+        ":GPUToGPURuntimeTransforms",
         ":GPUToLLVMIRTranslation",
+        ":GPUToNVVMTransforms",
         ":GPUTransformOps",
         ":IndexToLLVM",
         ":LLVMToLLVMIRTranslation",
@@ -5907,6 +5909,7 @@ cc_library(
         ":ControlFlowDialect",
         ":ControlFlowToLLVM",
         ":ConversionPassIncGen",
+        ":ConvertToLLVMInterface",
         ":FuncDialect",
         ":FuncToLLVM",
         ":GPUCommonTransforms",
@@ -6088,6 +6091,7 @@ cc_library(
    hdrs = [
         "include/mlir/Conversion/GPUCommon/AttrToSPIRVConverter.h",
         "include/mlir/Conversion/GPUCommon/GPUCommonPass.h",
+        "include/mlir/Conversion/GPUCommon/GPUToLLVM.h",
         "lib/Conversion/GPUCommon/GPUOpsLowering.h",
     ],
     includes = ["include"],
@@ -8374,6 +8378,31 @@ cc_library(
     ],
 )
 
+gentbl_cc_library(
+    name = "ToLLVMInterfaceIncGen",
+    tbl_outs = [
+        (
+            ["--gen-attr-interface-decls"],
+            "include/mlir/Conversion/ConvertToLLVM/ToLLVMAttrInterface.h.inc",
+        ),
+        (
+            ["--gen-attr-interface-defs"],
+            "include/mlir/Conversion/ConvertToLLVM/ToLLVMAttrInterface.cpp.inc",
+        ),
+        (
+            ["--gen-op-interface-decls"],
+            "include/mlir/Conversion/ConvertToLLVM/ToLLVMOpInterface.h.inc",
+        ),
+        (
+            ["--gen-op-interface-defs"],
+            "include/mlir/Conversion/ConvertToLLVM/ToLLVMOpInterface.cpp.inc",
+        ),
+    ],
+    tblgen = ":mlir-tblgen",
+    td_file = "include/mlir/Conversion/ConvertToLLVM/ToLLVMInterface.td",
+    deps = [":UBDialectTdFiles"],
+)
+
 cc_library(
     name = "ConvertToLLVMInterface",
     srcs = ["lib/Conversion/ConvertToLLVM/ToLLVMInterface.cpp"],
@@ -8382,6 +8411,7 @@ cc_library(
     deps = [
         ":IR",
         ":Support",
+        ":ToLLVMInterfaceIncGen",
         "//llvm:Support",
     ],
 )
@@ -8394,6 +8424,7 @@ cc_library(
     deps = [
         ":ConversionPassIncGen",
         ":ConvertToLLVMInterface",
+        ":Analysis",
         ":IR",
         ":LLVMCommonConversion",
         ":LLVMDialect",
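As a closing illustration (an editor's note, not part of the patch), the destructurable-type queries that became const in the LLVMMemorySlot.cpp hunk above can be exercised as follows; note the i32 IntegerAttr indexing that getTypeAtIndex guards for:

```cpp
// Sketch only; assumes an MLIRContext with the LLVM dialect already loaded.
#include <cassert>
#include <optional>

#include "llvm/ADT/DenseMap.h"
#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinTypes.h"

using namespace mlir;

static void queryStructFields(MLIRContext &ctx) {
  Type i32 = IntegerType::get(&ctx, 32);
  Type f64 = Float64Type::get(&ctx);
  auto structTy = LLVM::LLVMStructType::getLiteral(&ctx, {i32, f64});

  // Field indices are i32 IntegerAttrs, matching the isInteger(32) check in
  // getTypeAtIndex; any other attribute kind yields a null Type.
  Attribute idx = IntegerAttr::get(i32, 1);
  assert(structTy.getTypeAtIndex(idx) == f64);

  // The full index -> field-type map, one entry per struct element.
  std::optional<DenseMap<Attribute, Type>> fields =
      structTy.getSubelementIndexMap();
  assert(fields && fields->size() == 2);
}
```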